`CellMarker`: cellmarker, 2.0

This notebook shows the curation process underlying `bionty.CellMarker.df()`.

import pandas as pd

# Location of the source workbook (Cell_marker_All.xlsx) on the CellMarker download site.
url = "http://bio-bigdata.hrbmu.edu.cn/CellMarker/CellMarker_download_files/file/Cell_marker_All.xlsx"
# dtype=str reads every column as text, so identifier columns are not
# converted to numbers (presumably to keep IDs such as GeneID verbatim).
df_cm = pd.read_excel(url, dtype=str)

import unicodedata

# Upper- and lower-case Greek letters (including the final sigma).
greek_alphabet = "ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω"

# Map each Greek character to the last word of its Unicode name,
# e.g. "α" ("GREEK SMALL LETTER ALPHA") -> "ALPHA".
greek_to_letter = {
    ch: unicodedata.name(ch).split(" ")[-1]
    for ch in greek_alphabet
    if unicodedata.name(ch).startswith("GREEK ")
}


def greek2letter(string: str):
    """Spell out every Greek character by its letter name (α -> ALPHA)."""
    # greek_to_letter maps single Greek characters to upper-case names, so a
    # single translate pass yields the same text as replacing them one by one.
    spelled_out = str.maketrans(greek_to_letter)
    return string.translate(spelled_out)


def greek2latin(string: str):
    """Swap each Greek character for its one-letter Latin stand-in (α -> a).

    Characters outside the Greek alphabet are returned unchanged.
    """
    greek_chars = "ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω"
    latin_chars = "AaBbGgDdEeZzHhJjIiKkLlMmNnXxOoPpRrSssTtUuFfQqYyWw"
    # Position-wise pairing of the two alphabets, keyed by code point as
    # str.translate expects.
    table = {ord(greek): latin for greek, latin in zip(greek_chars, latin_chars)}
    return string.translate(table)


# Letter name -> Latin stand-in, e.g. "ALPHA" -> "a". Both cases of a Greek
# letter share one name; the later (lower-case) character supplies the value.
letter_to_latin = {
    letter_name: greek2latin(greek_char)
    for greek_char, letter_name in greek_to_letter.items()
}


def letter2latin(string: str):
    """Replace spelled-out Greek letter names by their Latin stand-in (ALPHA -> a).

    Each name in ``letter_to_latin`` is matched in three spellings: upper case
    ("ALPHA"), lower case ("alpha") and capitalized ("Alpha").

    Args:
        string: Text that may contain spelled-out Greek letter names.

    Returns:
        The text with every matched name replaced by its Latin letter.
    """
    # Substitute the longest names first: a short name can be embedded in a
    # longer one ("ETA" inside "THETA"), and replacing the short one first
    # would corrupt the longer name ("THETA" -> "THh") so it never matches.
    for k in sorted(letter_to_latin, key=len, reverse=True):
        v = letter_to_latin[k]
        string = string.replace(k, v).replace(k.lower(), v).replace(k.capitalize(), v)
    return string


def remove_case_insensitive_dup(myList: list):
    """Drop strings that repeat an earlier one when case is ignored.

    The first spelling encountered is kept and the input order is preserved.
    """
    first_spelling = {}
    for item in myList:
        # setdefault only stores the first item seen for each lower-cased key;
        # dicts keep insertion order, so the original order is retained.
        first_spelling.setdefault(item.lower(), item)
    return list(first_spelling.values())

def preprocess(df_cm: pd.DataFrame, species: str) -> pd.DataFrame:
    """Curate the CellMarker records of one species into a marker table.

    Steps:
      1. Keep the rows whose ``species`` column equals ``species.capitalize()``.
      2. Keep the marker, GeneID, Symbol and UNIPROTID columns, drop duplicated
         markers and rename the columns.
      3. For every marker, generate spelling variants (with/without spaces and
         hyphens, Greek characters spelled out or transliterated) and derive an
         upper-cased grouping key from the shortest variant.
      4. Merge the markers that share a grouping key, use the shortest original
         spelling as the name and join the other variants with "|".

    Intermediate shapes are printed and the table heads are shown with
    ``display``, which is not defined in this block (presumably the
    IPython/Jupyter builtin).

    Args:
        df_cm: CellMarker table with at least the columns ``species``,
            ``marker``, ``GeneID``, ``Symbol`` and ``UNIPROTID``.
        species: Species to keep, e.g. "human"; it is capitalized before the
            comparison.

    Returns:
        A DataFrame indexed by ``name`` with the columns ``synonyms``,
        ``gene_symbol``, ``ncbi_gene_id`` and ``uniprotkb_id``, sorted by
        ``gene_symbol``.
    """
    df_cm = df_cm[df_cm["species"] == species.capitalize()].copy()
    print(f"Original shape: {df_cm.shape}")
    display(df_cm.head())

    # Drop tissue, cell type, tech, journal, Genename
    df = df_cm[["marker", "GeneID", "Symbol", "UNIPROTID"]].copy()
    # Remove duplications
    df = df.drop_duplicates(subset=["marker", "GeneID", "UNIPROTID"])
    df = df.drop_duplicates(subset=["marker"])
    df.rename(
        columns={
            "marker": "name",
            "GeneID": "ncbi_gene_id",
            "Symbol": "gene_symbol",
            "UNIPROTID": "uniprotkb_id",
        },
        inplace=True,
    )
    df = df[df["name"].notnull()].copy()
    print(f"Unique shape: {df.shape}")
    display(df.head())

    # Aggregate on a hyphen-free, space-free latin name
    df.rename(columns={"name": "orig_name"}, inplace=True)

    # The per-row results are collected in lists and assigned as whole columns.
    # Writing into the Series yielded by DataFrame.iterrows() is not reliable:
    # it may be a copy, in which case the assignment is silently lost.
    synonym_lists = []
    agg_keys = []
    for n in df["orig_name"]:
        # remove spaces and hyphens from the name
        syns = {n, n.replace(" ", "").replace("-", "").replace("‐", "")}
        if "-" in n:
            syns.update([n.replace("-", " "), n.replace("-", "")])
        if " " in n:
            syns.update([n.replace(" ", "-"), n.replace(" ", "")])

        # convert greek symbols to latin and letters
        greeks = (
            {greek2letter(i) for i in syns}
            | {greek2latin(i) for i in syns}
            | {letter2latin(i) for i in syns}
        )
        variants = syns | greeks

        # min() returns the first shortest item, so the candidates are sorted
        # beforehand: ties no longer depend on set iteration order, which can
        # change from one interpreter run to the next.
        if greek2latin(n) != n:
            # if contains greek symbols, the name must be latin
            agg = min(sorted({greek2latin(i) for i in syns}), key=len)
        else:
            # use the shortest variant as the name
            agg = min(sorted(variants), key=len)

        agg_keys.append(agg.upper())
        synonym_lists.append(sorted(variants))

    df["synonyms"] = pd.Series(synonym_lists, index=df.index, dtype=object)
    df["agg"] = pd.Series(agg_keys, index=df.index, dtype=object)

    # aggregate all synonyms ("sum" concatenates the per-row lists)
    df_group = df.groupby("agg").agg(
        {
            "ncbi_gene_id": "first",
            "gene_symbol": "first",
            "uniprotkb_id": "first",
            "synonyms": "sum",
            "orig_name": list,
        }
    )

    # remove the synonyms that are only case different from name
    # use the shortest non-greek original name as the new name
    names = []
    joined_synonyms = []
    for group_orig_names, group_synonyms in zip(
        df_group["orig_name"], df_group["synonyms"]
    ):
        orig_names = set(group_orig_names)
        shortest_orig_name = min(sorted(orig_names), key=len)
        if greek2latin(shortest_orig_name) != shortest_orig_name:
            shortest_orig_name = greek2latin(shortest_orig_name)
        else:
            orig_names.remove(shortest_orig_name)

        syns = orig_names.union(group_synonyms)
        syns = {i for i in syns if i.lower() != shortest_orig_name.lower()}

        # sorted() fixes both the order of the synonyms and which spelling
        # survives the case-insensitive de-duplication.
        syns = remove_case_insensitive_dup(sorted(syns))

        joined_synonyms.append("|".join(syns))
        names.append(shortest_orig_name)

    df_group["synonyms"] = pd.Series(
        joined_synonyms, index=df_group.index, dtype=object
    )
    df_group["name"] = pd.Series(names, index=df_group.index, dtype=object)

    df_group.reset_index(drop=True, inplace=True)
    df_group = df_group[
        ["name", "synonyms", "gene_symbol", "ncbi_gene_id", "uniprotkb_id"]
    ]
    df_group = df_group.sort_values("gene_symbol")
    df_group = df_group.set_index("name")

    print(f"Final shape: {df_group.shape}")
    display(df_group.head())

    return df_group

Human

# Run the curation on the rows of df_cm whose species is "Human".
df_human = preprocess(df_cm, species="human")

Original shape: (60877, 20)

	species	tissue_class	tissue_type	uberonongology_id	cancer_type	cell_type	cell_name	cellontology_id	marker	Symbol	GeneID	Genetype	Genename	UNIPROTID	technology_seq	marker_source	PMID	Title	journal	year
0	Human	Abdomen	Abdomen	UBERON_0000916	Normal	Normal cell	Macrophage	CL_0000235	MERTK	MERTK	10461	protein_coding	MER proto-oncogene, tyrosine kinase	Q12866	None	Experiment	31982413	Peritoneal Level of CD206 Associates With Mort...	Gastroenterology	2020
1	Human	Abdomen	Abdomen	UBERON_0000916	Normal	Normal cell	Macrophage	CL_0000235	CD16	FCGR3A	2215	protein_coding	Fc fragment of IgG receptor IIIb	O75015	None	Experiment	31982413	Peritoneal Level of CD206 Associates With Mort...	Gastroenterology	2020
2	Human	Abdomen	Abdomen	UBERON_0000916	Normal	Normal cell	Macrophage	CL_0000235	CD206	MRC1	4360	protein_coding	mannose receptor C-type 1	P22897	None	Experiment	31982413	Peritoneal Level of CD206 Associates With Mort...	Gastroenterology	2020
3	Human	Abdomen	Abdomen	UBERON_0000916	Normal	Normal cell	Macrophage	CL_0000235	CRIg	VSIG4	11326	protein_coding	V-set and immunoglobulin domain containing 4	Q9Y279	None	Experiment	31982413	Peritoneal Level of CD206 Associates With Mort...	Gastroenterology	2020
4	Human	Abdomen	Abdomen	UBERON_0000916	Normal	Normal cell	Macrophage	CL_0000235	CD163	CD163	9332	protein_coding	CD163 molecule	Q86VB7	None	Experiment	31982413	Peritoneal Level of CD206 Associates With Mort...	Gastroenterology	2020

Unique shape: (16679, 4)

	name	ncbi_gene_id	gene_symbol	uniprotkb_id
0	MERTK	10461	MERTK	Q12866
1	CD16	2215	FCGR3A	O75015
2	CD206	4360	MRC1	P22897
3	CRIg	11326	VSIG4	Q9Y279
4	CD163	9332	CD163	Q86VB7

Final shape: (15466, 4)

	synonyms	gene_symbol	ncbi_gene_id	uniprotkb_id
name
A1BG		A1BG	1	P04217
A2M		A2M	3494	None
A2ML1		A2ML1	144568	A8K2U0
A4GALT		A4GALT	53947	A0A0S2Z5J1
AADAC		AADAC	13	P22760

# Confirm that no synonym is attached to multiple entries: put one synonym on
# each row, ignore empty strings and list every synonym occurring more than
# once (an empty result means there is none).
exp = df_human["synonyms"].str.split("|").to_frame()
exp = exp.explode("synonyms")
exp = exp.loc[exp["synonyms"].map(len) > 0]
exp[exp["synonyms"].duplicated(keep=False)]

	synonyms
name

# Write the curated human table to a Parquet file (relative path).
df_human.to_parquet("df_human__cellmarker__2.0__CellMarker.parquet")

Mouse

# Run the curation on the rows of df_cm whose species is "Mouse".
df_mouse = preprocess(df_cm, species="mouse")

Original shape: (35197, 20)

	species	tissue_class	tissue_type	uberonongology_id	cancer_type	cell_type	cell_name	cellontology_id	marker	Symbol	GeneID	Genetype	Genename	UNIPROTID	technology_seq	marker_source	PMID	Title	journal	year
8	Mouse	Abdomen	Muscle	UBERON_0001630	Normal	Normal cell	Fibro-adipogenic progenitor cell	NaN	Wisp1	Ccn4	22402	protein_coding	cellular communication network factor 4	O54775	10x Chromium	Experiment	35439171	An estrogen-sensitive fibroblast population dr...	JCI insight	2022
9	Mouse	Abdomen	Muscle	UBERON_0001630	Normal	Normal cell	Myoblast	CL_0000056	Myod1	Myod1	17927	protein_coding	myogenic differentiation 1	P10085	10x Chromium	Experiment	35439171	An estrogen-sensitive fibroblast population dr...	JCI insight	2022
10	Mouse	Abdomen	Muscle	UBERON_0001630	Normal	Normal cell	Muscle satellite cell	CL_0000514	Myf5	Myf5	17877	protein_coding	myogenic factor 5	A2RSK4	10x Chromium	Experiment	35439171	An estrogen-sensitive fibroblast population dr...	JCI insight	2022
11	Mouse	Abdomen	Muscle	UBERON_0001630	Normal	Normal cell	Myocyte	CL_0000187	Ckm	Ckm	12715	protein_coding	creatine kinase, muscle	A2RTA0	10x Chromium	Experiment	35439171	An estrogen-sensitive fibroblast population dr...	JCI insight	2022
12	Mouse	Abdomen	Muscle	UBERON_0001630	Normal	Normal cell	Myocyte	CL_0000187	Acta1	Acta1	11459	protein_coding	actin alpha 1, skeletal muscle	P68134	10x Chromium	Experiment	35439171	An estrogen-sensitive fibroblast population dr...	JCI insight	2022

Unique shape: (12503, 4)

	name	ncbi_gene_id	gene_symbol	uniprotkb_id
8	Wisp1	22402	Ccn4	O54775
9	Myod1	17927	Myod1	P10085
10	Myf5	17877	Myf5	A2RSK4
11	Ckm	12715	Ckm	A2RTA0
12	Acta1	11459	Acta1	P68134

Final shape: (11206, 4)

	synonyms	gene_symbol	ncbi_gene_id	uniprotkb_id
name
0610010K14Rik		0610010K14Rik	104457	D3Z687
0610030E20Rik		0610030E20Rik	68364	Q149G0
0610040J01Rik		0610040J01Rik	76261	Q99K99
1110017D15Rik		1110017D15Rik	73721	Q2MH31
1110032A03Rik		1110032A03Rik	68721	Q9D131

# Write the curated mouse table to a Parquet file (relative path).
df_mouse.to_parquet("df_mouse__cellmarker__2.0__CellMarker.parquet")

previous

Gene: ensembl, release-112

next

Protein: uniprot, 2023-03