CellMarker
: cellmarker; 2.0¶
This notebook documents the curation process behind the table returned by bionty.CellMarker.df().
import pandas as pd
# Fetch the complete CellMarker 2.0 spreadsheet; dtype=str is applied to every column.
url = "http://bio-bigdata.hrbmu.edu.cn/CellMarker/CellMarker_download_files/file/Cell_marker_All.xlsx"
df_cm = pd.read_excel(io=url, dtype=str)
import unicodedata
greek_alphabet = "ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω"
# Map each Greek character to the last word of its Unicode name,
# e.g. "GREEK SMALL LETTER ALPHA" -> "ALPHA".
greek_to_letter = {
    char: unicodedata.name(char).split(" ")[-1]
    for char in greek_alphabet
    if unicodedata.name(char).startswith("GREEK ")
}
def greek2letter(string: str):
    """α -> ALPHA."""
    # Every key of greek_to_letter is a single character and every value is
    # plain ASCII, so one translate pass yields the same text as replacing
    # the keys one after another.
    spelled_out = str.maketrans(greek_to_letter)
    return string.translate(spelled_out)
# Character-for-character translation table from the Greek alphabet to the
# Latin stand-ins used by this curation (both strings have equal length).
# Built once at import time instead of on every call.
_GREEK_TO_LATIN_TABLE = str.maketrans(
    "ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω",
    "AaBbGgDdEeZzHhJjIiKkLlMmNnXxOoPpRrSssTtUuFfQqYyWw",
)


def greek2latin(string: str) -> str:
    """α -> a.

    Replace every Greek letter in ``string`` with its single Latin stand-in
    and return the result; all other characters are left untouched.
    """
    return string.translate(_GREEK_TO_LATIN_TABLE)
# Spelled-out letter name -> Latin stand-in (ALPHA -> a). When several Greek
# characters share one spelled-out name, the last one iterated wins.
letter_to_latin = {
    spelled: greek2latin(greek_char)
    for greek_char, spelled in greek_to_letter.items()
}
def letter2latin(string: str):
    """ALPHA -> a."""
    for spelled, latin in letter_to_latin.items():
        # Try the spelling as stored, then lower-cased, then capitalized.
        for variant in (spelled, spelled.lower(), spelled.capitalize()):
            string = string.replace(variant, latin)
    return string
def remove_case_insensitive_dup(myList: list):
    """Return the items of ``myList`` without case-insensitive repeats.

    The first spelling encountered is the one that is kept, and the
    iteration order of the input is preserved.
    """
    seen_lowercase = set()
    unique_items = []
    for item in myList:
        lowered = item.lower()
        if lowered in seen_lowercase:
            continue
        seen_lowercase.add(lowered)
        unique_items.append(item)
    return unique_items
def _expand_marker_name(name: str) -> tuple:
    """Return ``(agg_key, synonyms)`` for one original marker name.

    ``synonyms`` is the sorted list of spelling variants of ``name``
    (spaces/hyphens removed or swapped, Greek symbols spelled out or
    latinized). ``agg_key`` is the upper-cased shortest variant and is used
    to group different spellings of the same marker.
    """
    # remove ' ' and '-' from the name ("‐" is a non-ASCII hyphen look-alike)
    syns = {name, name.replace(" ", "").replace("-", "").replace("‐", "")}
    if "-" in name:
        syns.update([name.replace("-", " "), name.replace("-", "")])
    if " " in name:
        syns.update([name.replace(" ", "-"), name.replace(" ", "")])

    # convert greek symbols to latin and letters
    latinized = {greek2latin(s) for s in syns}
    greeks = (
        {greek2letter(s) for s in syns}
        | latinized
        | {letter2latin(s) for s in syns}
    )
    # Sorting makes both the tie-break of min() and the stored order
    # independent of set iteration order, which varies between runs.
    all_syns = sorted(syns | greeks)

    if greek2latin(name) != name:
        # if contains greek symbols, the name must be latin
        agg = min(sorted(latinized), key=len)
    else:
        # use the shortest syns as the name
        agg = min(all_syns, key=len)
    return agg.upper(), all_syns


def _resolve_group(orig_names: list, synonyms: list) -> tuple:
    """Return ``(name, "syn1|syn2|...")`` for one aggregated marker group.

    The name is the shortest original name (the first one in input order on
    ties), latinized if it contains Greek symbols. Synonyms exclude anything
    that differs from the name, or from an earlier synonym, only by case.
    """
    name = min(orig_names, key=len)
    if greek2latin(name) != name:
        name = greek2latin(name)
    lowered_name = name.lower()
    # Sorted so the joined string is reproducible across runs.
    candidates = [
        s
        for s in sorted(set(orig_names).union(synonyms))
        if s.lower() != lowered_name
    ]
    return name, "|".join(remove_case_insensitive_dup(candidates))


def preprocess(df_cm: pd.DataFrame, species: str) -> pd.DataFrame:
    """Curate the marker table of one species from the raw CellMarker data.

    Rows are filtered to ``species`` (compared after ``str.capitalize``),
    reduced to the marker / gene id / symbol / UniProt columns, de-duplicated
    on the marker name, and then markers that only differ in spacing,
    hyphenation, case or Greek-letter spelling are merged into one entry.

    Args:
        df_cm: Raw table with at least the columns ``species``, ``marker``,
            ``GeneID``, ``Symbol`` and ``UNIPROTID``.
        species: Species to keep, e.g. ``"human"``.

    Returns:
        A frame indexed by ``name`` with the columns ``synonyms``
        (``|``-separated), ``gene_symbol``, ``ncbi_gene_id`` and
        ``uniprotkb_id``, sorted by ``gene_symbol``.

    Note:
        Intermediate shapes are printed and previews are shown with
        ``display``, which is not defined in this file — presumably the
        IPython/Jupyter builtin.
    """
    df_cm = df_cm[df_cm["species"] == species.capitalize()].copy()
    print(f"Original shape: {df_cm.shape}")
    display(df_cm.head())

    # Drop tissue, cell type, tech, journal, Genename
    df = df_cm[["marker", "GeneID", "Symbol", "UNIPROTID"]].copy()

    # Remove duplications
    df = df.drop_duplicates(subset=["marker", "GeneID", "UNIPROTID"])
    df = df.drop_duplicates(subset=["marker"])
    df.rename(
        columns={
            "marker": "name",
            "GeneID": "ncbi_gene_id",
            "Symbol": "gene_symbol",
            "UNIPROTID": "uniprotkb_id",
        },
        inplace=True,
    )
    df = df[df["name"].notnull()].copy()
    print(f"Unique shape: {df.shape}")
    display(df.head())

    # Aggregate on a no-hyphen, no-space latin name
    df.rename(columns={"name": "orig_name"}, inplace=True)
    # Build the new columns from plain lists instead of writing into the rows
    # yielded by iterrows(): such writes are not guaranteed to reach the frame.
    expanded = [_expand_marker_name(n) for n in df["orig_name"]]
    df["synonyms"] = pd.Series(
        [syns for _, syns in expanded], index=df.index, dtype=object
    )
    df["agg"] = pd.Series(
        [agg for agg, _ in expanded], index=df.index, dtype=object
    )

    # aggregate all synonyms
    df_group = df.groupby("agg").agg(
        {
            "ncbi_gene_id": "first",
            "gene_symbol": "first",
            "uniprotkb_id": "first",
            "synonyms": "sum",  # concatenates the per-row synonym lists
            "orig_name": list,
        }
    )

    # remove the synonyms that are only case different from name
    # use the shortest non-greek original name as the new name
    resolved = [
        _resolve_group(orig, syns)
        for orig, syns in zip(df_group["orig_name"], df_group["synonyms"])
    ]
    df_group["name"] = [name for name, _ in resolved]
    df_group["synonyms"] = [joined for _, joined in resolved]

    df_group.reset_index(drop=True, inplace=True)
    df_group = df_group[
        ["name", "synonyms", "gene_symbol", "ncbi_gene_id", "uniprotkb_id"]
    ]
    df_group = df_group.sort_values("gene_symbol")
    df_group = df_group.set_index("name")
    print(f"Final shape: {df_group.shape}")
    display(df_group.head())
    return df_group
Human
# Run the curation on the rows whose species is "Human".
df_human = preprocess(df_cm, species="human")
Original shape: (60877, 20)
species | tissue_class | tissue_type | uberonongology_id | cancer_type | cell_type | cell_name | cellontology_id | marker | Symbol | GeneID | Genetype | Genename | UNIPROTID | technology_seq | marker_source | PMID | Title | journal | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | MERTK | MERTK | 10461 | protein_coding | MER proto-oncogene, tyrosine kinase | Q12866 | None | Experiment | 31982413 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020 |
1 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | CD16 | FCGR3A | 2215 | protein_coding | Fc fragment of IgG receptor IIIb | O75015 | None | Experiment | 31982413 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020 |
2 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | CD206 | MRC1 | 4360 | protein_coding | mannose receptor C-type 1 | P22897 | None | Experiment | 31982413 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020 |
3 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | CRIg | VSIG4 | 11326 | protein_coding | V-set and immunoglobulin domain containing 4 | Q9Y279 | None | Experiment | 31982413 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020 |
4 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | CD163 | CD163 | 9332 | protein_coding | CD163 molecule | Q86VB7 | None | Experiment | 31982413 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020 |
Unique shape: (16679, 4)
name | ncbi_gene_id | gene_symbol | uniprotkb_id | |
---|---|---|---|---|
0 | MERTK | 10461 | MERTK | Q12866 |
1 | CD16 | 2215 | FCGR3A | O75015 |
2 | CD206 | 4360 | MRC1 | P22897 |
3 | CRIg | 11326 | VSIG4 | Q9Y279 |
4 | CD163 | 9332 | CD163 | Q86VB7 |
Final shape: (15466, 4)
synonyms | gene_symbol | ncbi_gene_id | uniprotkb_id | |
---|---|---|---|---|
name | ||||
A1BG | A1BG | 1 | P04217 | |
A2M | A2M | 3494 | None | |
A2ML1 | A2ML1 | 144568 | A8K2U0 | |
A4GALT | A4GALT | 53947 | A0A0S2Z5J1 | |
AADAC | AADAC | 13 | P22760 |
# confirm that no synonym is attached to multiple entries:
# split the "|"-joined synonyms into one row each, drop the empty strings,
# and show every synonym value that occurs more than once.
exp = df_human["synonyms"].str.split("|").to_frame()
exp = exp.explode("synonyms")
exp = exp[exp["synonyms"].map(len) > 0]
exp[exp["synonyms"].duplicated(keep=False)]
synonyms | |
---|---|
name |
# Write the curated human table to a parquet file in the working directory.
df_human.to_parquet("df_human__cellmarker__2.0__CellMarker.parquet")
Mouse
# Run the curation on the rows whose species is "Mouse".
df_mouse = preprocess(df_cm, species="mouse")
Original shape: (35197, 20)
species | tissue_class | tissue_type | uberonongology_id | cancer_type | cell_type | cell_name | cellontology_id | marker | Symbol | GeneID | Genetype | Genename | UNIPROTID | technology_seq | marker_source | PMID | Title | journal | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
8 | Mouse | Abdomen | Muscle | UBERON_0001630 | Normal | Normal cell | Fibro-adipogenic progenitor cell | NaN | Wisp1 | Ccn4 | 22402 | protein_coding | cellular communication network factor 4 | O54775 | 10x Chromium | Experiment | 35439171 | An estrogen-sensitive fibroblast population dr... | JCI insight | 2022 |
9 | Mouse | Abdomen | Muscle | UBERON_0001630 | Normal | Normal cell | Myoblast | CL_0000056 | Myod1 | Myod1 | 17927 | protein_coding | myogenic differentiation 1 | P10085 | 10x Chromium | Experiment | 35439171 | An estrogen-sensitive fibroblast population dr... | JCI insight | 2022 |
10 | Mouse | Abdomen | Muscle | UBERON_0001630 | Normal | Normal cell | Muscle satellite cell | CL_0000514 | Myf5 | Myf5 | 17877 | protein_coding | myogenic factor 5 | A2RSK4 | 10x Chromium | Experiment | 35439171 | An estrogen-sensitive fibroblast population dr... | JCI insight | 2022 |
11 | Mouse | Abdomen | Muscle | UBERON_0001630 | Normal | Normal cell | Myocyte | CL_0000187 | Ckm | Ckm | 12715 | protein_coding | creatine kinase, muscle | A2RTA0 | 10x Chromium | Experiment | 35439171 | An estrogen-sensitive fibroblast population dr... | JCI insight | 2022 |
12 | Mouse | Abdomen | Muscle | UBERON_0001630 | Normal | Normal cell | Myocyte | CL_0000187 | Acta1 | Acta1 | 11459 | protein_coding | actin alpha 1, skeletal muscle | P68134 | 10x Chromium | Experiment | 35439171 | An estrogen-sensitive fibroblast population dr... | JCI insight | 2022 |
Unique shape: (12503, 4)
name | ncbi_gene_id | gene_symbol | uniprotkb_id | |
---|---|---|---|---|
8 | Wisp1 | 22402 | Ccn4 | O54775 |
9 | Myod1 | 17927 | Myod1 | P10085 |
10 | Myf5 | 17877 | Myf5 | A2RSK4 |
11 | Ckm | 12715 | Ckm | A2RTA0 |
12 | Acta1 | 11459 | Acta1 | P68134 |
Final shape: (11206, 4)
synonyms | gene_symbol | ncbi_gene_id | uniprotkb_id | |
---|---|---|---|---|
name | ||||
0610010K14Rik | 0610010K14Rik | 104457 | D3Z687 | |
0610030E20Rik | 0610030E20Rik | 68364 | Q149G0 | |
0610040J01Rik | 0610040J01Rik | 76261 | Q99K99 | |
1110017D15Rik | 1110017D15Rik | 73721 | Q2MH31 | |
1110032A03Rik | 1110032A03Rik | 68721 | Q9D131 |
# Write the curated mouse table to a parquet file in the working directory.
df_mouse.to_parquet("df_mouse__cellmarker__2.0__CellMarker.parquet")