Protein: uniprot, 2023-03

import pandas as pd
def _get_shortest_name(
    df: pd.DataFrame, column: str, new_column: str = "name"
) -> None:
    """Split each row's names into one primary name plus synonyms.

    Every value in ``df[column]`` must be a string of names separated by
    ``", "``. For each row the shortest name that contains no space is chosen
    as the primary name; if every name contains a space, the shortest name
    overall is chosen. Ties go to the name that appears first in the string.
    The remaining distinct names are joined with ``"|"`` in their original
    order.

    The frame is modified in place and nothing is returned.

    Args:
        df: Frame to update.
        column: Column holding the ``", "``-separated names. It is overwritten
            with the ``"|"``-joined synonyms.
        new_column: Column that receives the chosen primary name.

    Raises:
        AttributeError: If a value in ``df[column]`` is not a string (for
            example NaN); fill missing values before calling.
    """
    names = []
    synonyms = []
    for raw in df[column]:
        # dict.fromkeys de-duplicates while keeping first-seen order. A set
        # would make both the tie-break and the synonym order depend on
        # string hash randomisation, i.e. differ from run to run.
        candidates = list(dict.fromkeys(raw.split(", ")))
        # Prefer names without spaces; str.split always yields at least one
        # element, so the fallback list is never empty.
        compact = [candidate for candidate in candidates if " " not in candidate]
        name = min(compact or candidates, key=len)
        candidates.remove(name)
        names.append(name)
        synonyms.append("|".join(candidates))

    df[new_column] = names
    df[column] = synonyms

Files are downloaded from: https://www.uniprot.org/uniprotkb

# Downloaded on 2022-09-26

# Location of the gzipped, tab-separated UniProt table for each species,
# keyed by the species label used in log messages and output file names.
filepaths = dict(
    human="https://bionty-assets.s3.amazonaws.com/uniprot-human-2023-03.tsv.gz",
    mouse="https://bionty-assets.s3.amazonaws.com/uniprot-mouse-2023-03.tsv.gz",
)

Human and mouse

Curate the tables

# Build one curated Protein table per species and write it to parquet.
for species, filepath in filepaths.items():
    print(f"Loading {species} data...")

    # Tab-separated UniProt export for this species.
    df = pd.read_csv(filepath, sep="\t")

    print(f"shape: {df.shape}")
    display(df.head())

    # Map the UniProt column headers onto the curated column names.
    df = df.rename(
        columns={
            "Entry": "uniprotkb_id",
            "Protein names": "synonyms",
            "Length": "length",
            "Gene Names (primary)": "gene_symbol",
            "Ensembl": "ensembl_gene_ids",
        }
    )

    # Ensembl identifiers: missing -> "", drop the trailing ";" and use "|"
    # as the separator between the remaining identifiers.
    df["ensembl_gene_ids"] = (
        df["ensembl_gene_ids"].fillna("").str.rstrip(";").str.replace(";", "|")
    )

    # Missing protein names become "" so the string split in
    # _get_shortest_name works on every row. The helper adds a "name" column
    # and rewrites "synonyms" with the remaining names joined by "|".
    # NOTE(review): the helper splits on ", "; in the printed output below,
    # parenthesised alternative names stay inside "name" -- confirm that this
    # is the intended separator for UniProt protein names.
    df["synonyms"] = df["synonyms"].fillna("")
    _get_shortest_name(df, "synonyms")

    # Keep the curated columns in their final order, drop rows without a
    # UniProtKB id, then sort by id and renumber the index.
    df = (
        df[
            [
                "uniprotkb_id",
                "name",
                "length",
                "synonyms",
                "gene_symbol",
                "ensembl_gene_ids",
            ]
        ]
        .dropna(subset=["uniprotkb_id"])
        .sort_values("uniprotkb_id")
        .reset_index(drop=True)
    )

    print(f"shape: {df.shape}, unique: {df['uniprotkb_id'].is_unique}")
    display(df.head())

    filename = f"df_{species}__uniprot__2023-03__Protein.parquet"
    df.to_parquet(filename)

    print(f"Wrote {filename}.")
    print("------------------------------------------------")
Loading human data...
shape: (207892, 5)
Entry Protein names Length Gene Names (primary) Ensembl
0 A0A024QZ08 Intraflagellar transport 20 homolog (Chlamydom... 132 IFT20 NaN
1 A0A024QZ26 Histone deacetylase 6, isoform CRA_c 1215 HDAC6 NaN
2 A0A024QZ86 T-box 2, isoform CRA_a 712 TBX2 NaN
3 A0A024QZA8 receptor protein-tyrosine kinase (EC 2.7.10.1) 976 EPHA2 NaN
4 A0A024QZB8 Battenin 438 CLN3 NaN
shape: (207892, 6), unique: True
uniprotkb_id name length synonyms gene_symbol ensembl_gene_ids
0 A0A023HJ61 HRES-1/RAB4 variant 121 RAB4A
1 A0A023HN28 SRSF3/USP6 fusion protein 16 NaN
2 A0A023I7F4 Cytochrome b 380 CYTB
3 A0A023I7H2 NADH-ubiquinone oxidoreductase chain 5 (EC 7.1... 603 ND5
4 A0A023I7H5 ATP synthase subunit a 226 ATP6
Wrote df_human__uniprot__2023-03__Protein.parquet.
------------------------------------------------
Loading mouse data...
shape: (86886, 5)
Entry Protein names Length Gene Names (primary) Ensembl
0 A0A075F5C6 Heat shock factor 1 (Heat shock transcription ... 531 Hsf1 ENSMUST00000228371.2;
1 A0A087WPF7 Autism susceptibility gene 2 protein homolog 1261 Auts2 ENSMUST00000161226 [A0A087WPF7-1];ENSMUST00000...
2 A0A087WPU4 FAT atypical cadherin 1 159 Fat1 ENSMUST00000186342.3;
3 A0A087WRK1 Predicted gene, 20814 (Predicted gene, 20855) ... 222 Gm20905 ENSMUST00000185240.2;ENSMUST00000185245.2;ENSM...
4 A0A087WRT4 FAT atypical cadherin 1 4602 Fat1 ENSMUST00000189017.8;
shape: (86886, 6), unique: True
uniprotkb_id name length synonyms gene_symbol ensembl_gene_ids
0 A0A023JDV8 Creatine transporter SLC6A8 variant D 224 Slc6a8
1 A0A023NCR8 Cytochrome b (Complex III subunit 3) (Complex ... 233 cytB
2 A0A023NCS0 Cytochrome b (Complex III subunit 3) (Complex ... 222 cytB
3 A0A023ND59 Cytochrome b (Complex III subunit 3) (Complex ... 227 cytB
4 A0A023NDP0 Cytochrome b (Complex III subunit 3) (Complex ... 242 cytB
Wrote df_mouse__uniprot__2023-03__Protein.parquet.
------------------------------------------------