ICD-10-CM XML to CSV (tab-separated)

Source: https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Publications/ICD10CM/2024-Update/

import pandas as pd
import re
# Load the whole ICD-10-CM tabular XML document into memory as one string.
with open("icd10cm-tabular-April-2024.xml", mode='r', encoding='utf-8') as file:
    content = file.read()

# Each match captures the text of a <name> element together with the text of
# the first <desc> element that follows it. re.DOTALL lets `.` cross newlines,
# so the two tags may sit on different lines.
pattern = re.compile(
    r'<name>(.*?)<\/name>.*?<desc>(.*?)<\/desc>',
    flags=re.DOTALL,
)
matches = re.findall(pattern, content)  # list of (name, desc) tuples

# One row per match: the <name> text becomes Code, the <desc> text Description.
df = pd.DataFrame(data=matches, columns=['Code', 'Description'])
df.head(n=5)
Code Description
0 A00 Cholera
1 A00.0 Cholera due to Vibrio cholerae 01, biovar chol...
2 A00.1 Cholera due to Vibrio cholerae 01, biovar eltor
3 A00.9 Cholera, unspecified
4 A01 Typhoid and paratyphoid fevers
# Written tab-separated (despite the .csv file name) because the values
# contain commas.
df.to_csv(
    path_or_buf="icd-10-2024-04.csv",
    sep="\t",
    index=False,
    encoding='utf-8',
)