`Protein`: uniprot, 2023-03

import pandas as pd

def _get_shortest_name(df: pd.DataFrame, column: str, new_column="name"):
    """Split a column of name strings into one short name plus synonyms.

    Each value in ``df[column]`` must be a string of names separated by
    ``", "``. For every row the shortest name that contains no space is
    chosen, falling back to the shortest name overall when every name
    contains a space. The remaining names become the synonyms, joined
    with ``"|"``.

    Duplicate names are dropped and first-seen order is kept, so ties on
    length resolve to the earliest name and the result is reproducible
    across runs. (Collecting the names in a ``set`` would make both the
    tie-break and the synonym order depend on string hash randomization.)

    Args:
        df: DataFrame to modify in place.
        column: Column holding the ``", "``-separated names. It is
            overwritten with the ``"|"``-joined leftover synonyms.
        new_column: Column that receives the chosen name.

    Returns:
        None. ``df`` is mutated in place.
    """
    name_list = []
    synonyms_list = []
    for value in df[column]:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        candidates = list(dict.fromkeys(value.split(", ")))
        no_space_names = [c for c in candidates if " " not in c]
        # str.split always yields at least one item, so min() never sees
        # an empty sequence here.
        name = min(no_space_names or candidates, key=len)
        name_list.append(name)
        synonyms_list.append("|".join(c for c in candidates if c != name))

    df[new_column] = name_list
    df[column] = synonyms_list

Files are downloaded from: https://www.uniprot.org/uniprotkb

# Gzipped UniProtKB TSV exports, keyed by species name.
# Downloaded on 2022-09-26.
# NOTE(review): that date predates the 2023-03 release named in the file
# names below -- confirm the actual download date.
filepaths = dict(
    human="https://bionty-assets.s3.amazonaws.com/uniprot-human-2023-03.tsv.gz",
    mouse="https://bionty-assets.s3.amazonaws.com/uniprot-mouse-2023-03.tsv.gz",
)

Human and mouse

Curate the tables

# Source column header -> curated column name.
column_map = {
    "Entry": "uniprotkb_id",
    "Protein names": "synonyms",
    "Length": "length",
    "Gene Names (primary)": "gene_symbol",
    "Ensembl": "ensembl_gene_ids",
}

# Columns (and their order) kept in the written table.
output_columns = [
    "uniprotkb_id",
    "name",
    "length",
    "synonyms",
    "gene_symbol",
    "ensembl_gene_ids",
]

# Curate one table per species and write it to a parquet file.
for species, filepath in filepaths.items():
    print(f"Loading {species} data...")

    df = pd.read_csv(filepath, sep="\t")

    print(f"shape: {df.shape}")
    display(df.head())

    df = df.rename(columns=column_map)

    # Ensembl ids arrive ";"-terminated and ";"-separated: treat missing
    # values as empty, drop the trailing ";" and join the ids with "|".
    ensembl_ids = df["ensembl_gene_ids"].fillna("").str.rstrip(";")
    df["ensembl_gene_ids"] = ensembl_ids.str.replace(";", "|")

    # Choose the shortest entry of "synonyms" as "name"; the remaining
    # entries stay in "synonyms", joined with "|".
    df["synonyms"] = df["synonyms"].fillna("")
    _get_shortest_name(df, "synonyms")
    df = df[output_columns]

    # Drop rows without a UniProtKB id, then sort by id with a fresh index.
    df = df[df["uniprotkb_id"].notnull()]
    df = df.sort_values("uniprotkb_id").reset_index(drop=True)

    print(f"shape: {df.shape}, unique: {df.uniprotkb_id.is_unique}")
    display(df.head())

    filename = f"df_{species}__uniprot__2023-03__Protein.parquet"
    df.to_parquet(filename)

    print(f"Wrote {filename}.")
    print("------------------------------------------------")

Loading human data...
shape: (207892, 5)

	Entry	Protein names	Length	Gene Names (primary)	Ensembl
0	A0A024QZ08	Intraflagellar transport 20 homolog (Chlamydom...	132	IFT20	NaN
1	A0A024QZ26	Histone deacetylase 6, isoform CRA_c	1215	HDAC6	NaN
2	A0A024QZ86	T-box 2, isoform CRA_a	712	TBX2	NaN
3	A0A024QZA8	receptor protein-tyrosine kinase (EC 2.7.10.1)	976	EPHA2	NaN
4	A0A024QZB8	Battenin	438	CLN3	NaN

shape: (207892, 6), unique: True

	uniprotkb_id	name	length	synonyms	gene_symbol	ensembl_gene_ids
0	A0A023HJ61	HRES-1/RAB4 variant	121		RAB4A
1	A0A023HN28	SRSF3/USP6 fusion protein	16		NaN
2	A0A023I7F4	Cytochrome b	380		CYTB
3	A0A023I7H2	NADH-ubiquinone oxidoreductase chain 5 (EC 7.1...	603		ND5
4	A0A023I7H5	ATP synthase subunit a	226		ATP6

Wrote df_human__uniprot__2023-03__Protein.parquet.
------------------------------------------------
Loading mouse data...
shape: (86886, 5)

	Entry	Protein names	Length	Gene Names (primary)	Ensembl
0	A0A075F5C6	Heat shock factor 1 (Heat shock transcription ...	531	Hsf1	ENSMUST00000228371.2;
1	A0A087WPF7	Autism susceptibility gene 2 protein homolog	1261	Auts2	ENSMUST00000161226 [A0A087WPF7-1];ENSMUST00000...
2	A0A087WPU4	FAT atypical cadherin 1	159	Fat1	ENSMUST00000186342.3;
3	A0A087WRK1	Predicted gene, 20814 (Predicted gene, 20855) ...	222	Gm20905	ENSMUST00000185240.2;ENSMUST00000185245.2;ENSM...
4	A0A087WRT4	FAT atypical cadherin 1	4602	Fat1	ENSMUST00000189017.8;

shape: (86886, 6), unique: True

	uniprotkb_id	name	length	synonyms	gene_symbol	ensembl_gene_ids
0	A0A023JDV8	Creatine transporter SLC6A8 variant D	224		Slc6a8
1	A0A023NCR8	Cytochrome b (Complex III subunit 3) (Complex ...	233		cytB
2	A0A023NCS0	Cytochrome b (Complex III subunit 3) (Complex ...	222		cytB
3	A0A023ND59	Cytochrome b (Complex III subunit 3) (Complex ...	227		cytB
4	A0A023NDP0	Cytochrome b (Complex III subunit 3) (Complex ...	242		cytB

Wrote df_mouse__uniprot__2023-03__Protein.parquet.
------------------------------------------------

previous

CellMarker: cellmarker; 2.0

next

Protein: uniprot, 2023-02