CellMarker: cellmarker; 2.0

The underlying curation process for bionty.CellMarker.df()

import pandas as pd
# Location of the complete CellMarker 2.0 marker table (an Excel workbook).
url = "http://bio-bigdata.hrbmu.edu.cn/CellMarker/CellMarker_download_files/file/Cell_marker_All.xlsx"
# Load every column as text so that identifiers are not converted to numbers.
df_cm = pd.read_excel(io=url, dtype=str)
import unicodedata

# Every Greek letter handled by this notebook, upper case followed by lower
# case (plus the final-sigma form).
greek_alphabet = "ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω"

# Map each Greek character to the last word of its Unicode name, e.g.
# "GREEK SMALL LETTER ALPHA" -> "ALPHA". Characters whose Unicode name does
# not start with "GREEK " are left out of the table.
greek_to_letter = {
    symbol: unicodedata.name(symbol).rsplit(" ", 1)[-1]
    for symbol in greek_alphabet
    if unicodedata.name(symbol).startswith("GREEK ")
}


def greek2letter(string: str):
    """Spell out Greek characters by their letter name (α -> ALPHA).

    Every character that is a key of the module-level ``greek_to_letter``
    table is replaced by the corresponding value; all other characters are
    kept unchanged.
    """
    # The table maps single characters to plain letter names, so one
    # translate pass gives the same result as replacing them one by one.
    spelled_out = str.maketrans(greek_to_letter)
    return string.translate(spelled_out)


def greek2latin(string: str):
    """Swap each Greek character for a single Latin letter (α -> a).

    The two alphabets below are aligned position by position; characters
    that are not in the Greek alphabet string are returned unchanged.
    """
    greek_chars = "ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω"
    latin_chars = "AaBbGgDdEeZzHhJjIiKkLlMmNnXxOoPpRrSssTtUuFfQqYyWw"
    # str.translate looks characters up by code point.
    table = {ord(greek): latin for greek, latin in zip(greek_chars, latin_chars)}
    return string.translate(table)


# Letter name -> Latin letter, e.g. "ALPHA" -> "a". Several characters share
# one letter name (upper case, lower case); the entry produced by the last of
# them in greek_to_letter is the one that is kept.
letter_to_latin = {
    letter_name: greek2latin(symbol)
    for symbol, letter_name in greek_to_letter.items()
}


def letter2latin(string: str):
    """Replace spelled-out Greek letter names by a Latin letter (ALPHA -> a).

    For every entry of the module-level ``letter_to_latin`` table, the letter
    name is replaced in three spellings: as stored in the table, lower-cased
    and capitalized (``ALPHA``, ``alpha``, ``Alpha``).

    Args:
        string: The text to convert.

    Returns:
        The text with all recognized letter names replaced.
    """
    # Longer names must be handled first: "ETA" is contained in "THETA", so
    # replacing "ETA" first would turn "THETA" into "TH" + letter and the
    # full name would never match afterwards. The sort is stable, so names
    # of equal length keep their table order.
    longest_first = sorted(letter_to_latin.items(), key=lambda item: -len(item[0]))
    for k, v in longest_first:
        string = string.replace(k, v).replace(k.lower(), v).replace(k.capitalize(), v)
    return string


def remove_case_insensitive_dup(myList: list):
    """Drop entries that differ from an earlier entry only by letter case.

    The first spelling encountered is the one kept, and the kept entries
    are returned as a list in their order of first appearance.
    """
    first_spelling = {}
    for entry in myList:
        # setdefault only stores the entry when its lower-cased form is new;
        # dicts remember insertion order, which preserves the input order.
        first_spelling.setdefault(entry.lower(), entry)
    return list(first_spelling.values())
def _shortest_first(text: str):
    """Sort key: shorter strings first, ties broken alphabetically."""
    return len(text), text


def _expand_marker_name(name: str):
    """Return ``(grouping key, spelling variants)`` for one marker name."""
    # Spellings with blanks and dashes removed or swapped for each other
    # (the third character stripped below is a non-ASCII dash).
    spellings = {name, name.replace(" ", "").replace("-", "").replace("‐", "")}
    if "-" in name:
        spellings.update([name.replace("-", " "), name.replace("-", "")])
    if " " in name:
        spellings.update([name.replace(" ", "-"), name.replace(" ", "")])

    # Greek symbols / letter names converted in every supported direction.
    latinized = {greek2latin(s) for s in spellings}
    variants = spellings | latinized
    variants |= {greek2letter(s) for s in spellings}
    variants |= {letter2latin(s) for s in spellings}

    # A name containing Greek symbols is keyed by its latinized form;
    # any other name is keyed by the shortest of all its variants.
    candidates = latinized if greek2latin(name) != name else variants
    key = min(candidates, key=_shortest_first).upper()
    return key, variants


def _name_and_synonyms(orig_names: list, variants_by_name: dict):
    """Return ``(name, "|"-joined synonyms)`` for one group of marker names."""
    remaining = set(orig_names)
    name = min(remaining, key=_shortest_first)
    latin_name = greek2latin(name)
    if latin_name != name:
        # The Greek spelling stays among the synonyms.
        name = latin_name
    else:
        remaining.remove(name)

    pooled = remaining.union(*(variants_by_name[n] for n in orig_names))
    # Drop synonyms that differ from the name only by case; sorting makes
    # the kept spelling and the output order reproducible between runs.
    lowered_name = name.lower()
    kept = sorted(s for s in pooled if s.lower() != lowered_name)
    return name, "|".join(remove_case_insensitive_dup(kept))


def preprocess(df_cm: pd.DataFrame, species: str):
    """Curate the CellMarker table of one species into a marker table.

    Steps:
      1. Keep the rows whose ``species`` column equals ``species.capitalize()``.
      2. Keep the marker, gene id, gene symbol and UniProt id columns, one
         row per marker name, and drop rows without a marker name.
      3. Derive spelling variants of every marker name (blank/dash variants,
         Greek symbols <-> letter names <-> Latin letters) and a grouping key.
      4. Merge markers sharing a grouping key. The shortest original name
         (latinized if it contains Greek symbols) becomes the name; all
         other spellings become ``|``-separated synonyms. Gene and protein
         ids are taken from the first non-missing value of each group.

    Intermediate shapes and table heads are printed / shown with ``display``,
    which is not defined in this file (presumably the IPython/Jupyter
    built-in).

    Args:
        df_cm: Table with at least the columns ``species``, ``marker``,
            ``GeneID``, ``Symbol`` and ``UNIPROTID``; marker names are
            expected to be strings or missing.
        species: Species name, matched after ``str.capitalize()``.

    Returns:
        A DataFrame indexed by ``name`` with the columns ``synonyms``,
        ``gene_symbol``, ``ncbi_gene_id`` and ``uniprotkb_id``, sorted by
        ``gene_symbol``.
    """
    df_cm = df_cm[df_cm["species"] == species.capitalize()].copy()
    print(f"Original shape: {df_cm.shape}")
    display(df_cm.head())

    # Drop tissue, cell type, tech, journal, Genename
    df = df_cm[["marker", "GeneID", "Symbol", "UNIPROTID"]].copy()
    # Remove duplications
    df = df.drop_duplicates(subset=["marker", "GeneID", "UNIPROTID"])
    df = df.drop_duplicates(subset=["marker"])
    df = df.rename(
        columns={
            "marker": "name",
            "GeneID": "ncbi_gene_id",
            "Symbol": "gene_symbol",
            "UNIPROTID": "uniprotkb_id",
        }
    )
    df = df[df["name"].notnull()].copy()
    print(f"Unique shape: {df.shape}")
    display(df.head())

    # Aggregate on an upper-case Latin name without blanks and dashes.
    # The key column is built explicitly: writing into the rows yielded by
    # DataFrame.iterrows() is not guaranteed to modify the frame.
    df = df.rename(columns={"name": "orig_name"})
    expanded = {name: _expand_marker_name(name) for name in df["orig_name"]}
    df["agg"] = [expanded[name][0] for name in df["orig_name"]]
    variants_by_name = {name: variants for name, (_, variants) in expanded.items()}

    df_group = df.groupby("agg").agg(
        {
            "ncbi_gene_id": "first",
            "gene_symbol": "first",
            "uniprotkb_id": "first",
            "orig_name": list,
        }
    )

    # Use the shortest non-Greek original name as the new name and collect
    # every other spelling of the group as its synonyms.
    names = []
    synonyms = []
    for grouped_names in df_group["orig_name"]:
        name, joined = _name_and_synonyms(grouped_names, variants_by_name)
        names.append(name)
        synonyms.append(joined)
    df_group["name"] = names
    df_group["synonyms"] = synonyms

    df_group.reset_index(drop=True, inplace=True)
    df_group = df_group[
        ["name", "synonyms", "gene_symbol", "ncbi_gene_id", "uniprotkb_id"]
    ]
    df_group = df_group.sort_values("gene_symbol")
    df_group = df_group.set_index("name")

    print(f"Final shape: {df_group.shape}")
    display(df_group.head())

    return df_group

Human

# Curate the rows of df_cm whose species column is "Human".
df_human = preprocess(df_cm, species="human")
Original shape: (60877, 20)
species tissue_class tissue_type uberonongology_id cancer_type cell_type cell_name cellontology_id marker Symbol GeneID Genetype Genename UNIPROTID technology_seq marker_source PMID Title journal year
0 Human Abdomen Abdomen UBERON_0000916 Normal Normal cell Macrophage CL_0000235 MERTK MERTK 10461 protein_coding MER proto-oncogene, tyrosine kinase Q12866 None Experiment 31982413 Peritoneal Level of CD206 Associates With Mort... Gastroenterology 2020
1 Human Abdomen Abdomen UBERON_0000916 Normal Normal cell Macrophage CL_0000235 CD16 FCGR3A 2215 protein_coding Fc fragment of IgG receptor IIIb O75015 None Experiment 31982413 Peritoneal Level of CD206 Associates With Mort... Gastroenterology 2020
2 Human Abdomen Abdomen UBERON_0000916 Normal Normal cell Macrophage CL_0000235 CD206 MRC1 4360 protein_coding mannose receptor C-type 1 P22897 None Experiment 31982413 Peritoneal Level of CD206 Associates With Mort... Gastroenterology 2020
3 Human Abdomen Abdomen UBERON_0000916 Normal Normal cell Macrophage CL_0000235 CRIg VSIG4 11326 protein_coding V-set and immunoglobulin domain containing 4 Q9Y279 None Experiment 31982413 Peritoneal Level of CD206 Associates With Mort... Gastroenterology 2020
4 Human Abdomen Abdomen UBERON_0000916 Normal Normal cell Macrophage CL_0000235 CD163 CD163 9332 protein_coding CD163 molecule Q86VB7 None Experiment 31982413 Peritoneal Level of CD206 Associates With Mort... Gastroenterology 2020
Unique shape: (16679, 4)
name ncbi_gene_id gene_symbol uniprotkb_id
0 MERTK 10461 MERTK Q12866
1 CD16 2215 FCGR3A O75015
2 CD206 4360 MRC1 P22897
3 CRIg 11326 VSIG4 Q9Y279
4 CD163 9332 CD163 Q86VB7
Final shape: (15466, 4)
synonyms gene_symbol ncbi_gene_id uniprotkb_id
name
A1BG A1BG 1 P04217
A2M A2M 3494 None
A2ML1 A2ML1 144568 A8K2U0
A4GALT A4GALT 53947 A0A0S2Z5J1
AADAC AADAC 13 P22760
# confirm that no synonym is attached to multiple entries

# One row per (marker name, synonym) pair.
exp = df_human["synonyms"].str.split("|").to_frame().explode("synonyms")
# Markers without synonyms contribute an empty string; leave those out.
exp = exp[exp["synonyms"].map(len) > 0]
# Show every synonym that occurs more than once (expected: none).
exp[exp["synonyms"].duplicated(keep=False)]
synonyms
name
# Write the curated human marker table to a parquet file (relative path).
df_human.to_parquet("df_human__cellmarker__2.0__CellMarker.parquet")

Mouse

# Curate the rows of df_cm whose species column is "Mouse".
df_mouse = preprocess(df_cm, species="mouse")
Original shape: (35197, 20)
species tissue_class tissue_type uberonongology_id cancer_type cell_type cell_name cellontology_id marker Symbol GeneID Genetype Genename UNIPROTID technology_seq marker_source PMID Title journal year
8 Mouse Abdomen Muscle UBERON_0001630 Normal Normal cell Fibro-adipogenic progenitor cell NaN Wisp1 Ccn4 22402 protein_coding cellular communication network factor 4 O54775 10x Chromium Experiment 35439171 An estrogen-sensitive fibroblast population dr... JCI insight 2022
9 Mouse Abdomen Muscle UBERON_0001630 Normal Normal cell Myoblast CL_0000056 Myod1 Myod1 17927 protein_coding myogenic differentiation 1 P10085 10x Chromium Experiment 35439171 An estrogen-sensitive fibroblast population dr... JCI insight 2022
10 Mouse Abdomen Muscle UBERON_0001630 Normal Normal cell Muscle satellite cell CL_0000514 Myf5 Myf5 17877 protein_coding myogenic factor 5 A2RSK4 10x Chromium Experiment 35439171 An estrogen-sensitive fibroblast population dr... JCI insight 2022
11 Mouse Abdomen Muscle UBERON_0001630 Normal Normal cell Myocyte CL_0000187 Ckm Ckm 12715 protein_coding creatine kinase, muscle A2RTA0 10x Chromium Experiment 35439171 An estrogen-sensitive fibroblast population dr... JCI insight 2022
12 Mouse Abdomen Muscle UBERON_0001630 Normal Normal cell Myocyte CL_0000187 Acta1 Acta1 11459 protein_coding actin alpha 1, skeletal muscle P68134 10x Chromium Experiment 35439171 An estrogen-sensitive fibroblast population dr... JCI insight 2022
Unique shape: (12503, 4)
name ncbi_gene_id gene_symbol uniprotkb_id
8 Wisp1 22402 Ccn4 O54775
9 Myod1 17927 Myod1 P10085
10 Myf5 17877 Myf5 A2RSK4
11 Ckm 12715 Ckm A2RTA0
12 Acta1 11459 Acta1 P68134
Final shape: (11206, 4)
synonyms gene_symbol ncbi_gene_id uniprotkb_id
name
0610010K14Rik 0610010K14Rik 104457 D3Z687
0610030E20Rik 0610030E20Rik 68364 Q149G0
0610040J01Rik 0610040J01Rik 76261 Q99K99
1110017D15Rik 1110017D15Rik 73721 Q2MH31
1110032A03Rik 1110032A03Rik 68721 Q9D131
# Write the curated mouse marker table to a parquet file (relative path).
df_mouse.to_parquet("df_mouse__cellmarker__2.0__CellMarker.parquet")