ICD-10 XML to CSV (tab-separated output)

Source: https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Publications/ICD10CM/2024-Update/

import html
import re

import pandas as pd
# Load the whole tabular XML export into memory as text.
with open("icd10cm-tabular-April-2024.xml", encoding="utf-8") as file:
    content = file.read()

# Pair each <name> with the <desc> that directly follows it.  Only whitespace
# may sit between the two tags: a lazy ".*?" under DOTALL would skip over
# unrelated elements and attach a name to some later, unrelated <desc>,
# swallowing every name in between.
pattern = re.compile(r"<name>(.*?)</name>\s*<desc>(.*?)</desc>", re.DOTALL)

# The captured text is raw XML, so character references such as "&amp;" are
# still escaped at this point; decode them so the table holds the real text.
matches = [
    (html.unescape(code), html.unescape(description))
    for code, description in pattern.findall(content)
]

# One row per (name, desc) pair, in document order.
df = pd.DataFrame(matches, columns=["Code", "Description"])
df.head()  # notebook-style preview; has no effect when run as a plain script
Code Description
0 A00 Cholera
1 A00.0 Cholera due to Vibrio cholerae 01, biovar chol...
2 A00.1 Cholera due to Vibrio cholerae 01, biovar eltor
3 A00.9 Cholera, unspecified
4 A01 Typhoid and paratyphoid fevers
# Write the table without the index column.  Note the delimiter is a tab, so
# the ".csv" file is really TSV: the descriptions contain commas, and a tab
# was chosen as the separator for that reason.
df.to_csv("icd-10-2024-04.csv", sep="\t", index=False, encoding="utf-8")