Gene: ensembl, release-109

  • https://www.ensembl.org/info/data/mysql.html

  • https://www.ensembl.org/info/docs/api/core/core_schema.html

Install mysqlclient: https://pypi.org/project/mysqlclient/

import pandas as pd
from sqlalchemy import create_engine
import mysql.connector as sql  # needed
def get_url(db="homo_sapiens_core_109_38"):
    return f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org/{db}"

Human

engine = create_engine(url=get_url())

Queries

query_core = """
SELECT gene.stable_id, xref.display_label, gene.biotype, gene.description, external_synonym.synonym
FROM gene
LEFT JOIN xref ON gene.display_xref_id = xref.xref_id
LEFT JOIN external_synonym ON gene.display_xref_id = external_synonym.xref_id
"""

query_external = """
SELECT gene.stable_id, object_xref.xref_id, xref.dbprimary_acc, external_db.db_name
FROM gene
LEFT JOIN object_xref ON gene.gene_id = object_xref.ensembl_id
LEFT JOIN xref ON object_xref.xref_id = xref.xref_id
LEFT JOIN external_db ON xref.external_db_id = external_db.external_db_id
WHERE object_xref.ensembl_object_type = 'Gene' AND external_db.db_name IN ('HGNC', 'EntrezGene')
"""

Query for the basic gene annotations:

results_core = pd.read_sql(query_core, con=engine)
results_core.shape
(111838, 5)
results_core.head()
stable_id display_label biotype description synonym
0 ENSG00000210049 MT-TF Mt_tRNA mitochondrially encoded tRNA-Phe (UUU/C) [Sour... MTTF
1 ENSG00000210049 MT-TF Mt_tRNA mitochondrially encoded tRNA-Phe (UUU/C) [Sour... trnF
2 ENSG00000211459 MT-RNR1 Mt_rRNA mitochondrially encoded 12S rRNA [Source:HGNC ... 12S
3 ENSG00000211459 MT-RNR1 Mt_rRNA mitochondrially encoded 12S rRNA [Source:HGNC ... MOTS-c
4 ENSG00000211459 MT-RNR1 Mt_rRNA mitochondrially encoded 12S rRNA [Source:HGNC ... MTRNR1
results_core_group = results_core.groupby("stable_id").agg(
    {
        "display_label": "first",
        "biotype": "first",
        "description": "first",
        "synonym": lambda x: "|".join([i for i in set(x) if i is not None]),
    }
)
results_core_group.head()
display_label biotype description synonym
stable_id
ENSG00000000003 TSPAN6 protein_coding tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] TSPAN-6|T245|TM4SF6
ENSG00000000005 TNMD protein_coding tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] tendin|ChM1L|TEM|myodulin|BRICD4
ENSG00000000419 DPM1 protein_coding dolichyl-phosphate mannosyltransferase subunit... CDGIE|MPDS
ENSG00000000457 SCYL3 protein_coding SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... PACE1|PACE-1
ENSG00000000460 C1orf112 protein_coding chromosome 1 open reading frame 112 [Source:HG... FLJ10706
results_core_group = results_core_group[results_core_group.index.str.startswith("ENS")]
results_core_group.shape
(69299, 4)

Query for external ids:

results_external = pd.read_sql(query_external, con=engine)
results_external = results_external[results_external.stable_id.str.startswith("ENS")]
results_external.shape
(80857, 4)
results_external.head(2)
stable_id xref_id dbprimary_acc db_name
0 ENSG00000210049 2898423 HGNC:7481 HGNC
1 ENSG00000211459 2898394 HGNC:7470 HGNC

HGNC

hgnc = (
    results_external[results_external["db_name"] == "HGNC"]
    .drop_duplicates(["stable_id", "dbprimary_acc"])
    .drop(columns=["xref_id", "db_name"])
)
hgnc.rename(columns={"dbprimary_acc": "hgnc_id"}, inplace=True)
dup = hgnc[hgnc.stable_id.duplicated(keep=False)]
dup
stable_id hgnc_id
6193 ENSG00000277796 HGNC:10628
6195 ENSG00000277796 HGNC:30554
6366 ENSG00000277768 HGNC:10628
6368 ENSG00000277768 HGNC:30554
9921 ENSG00000277336 HGNC:10628
9923 ENSG00000277336 HGNC:30554
12519 ENSG00000288487 HGNC:16346
12520 ENSG00000288487 HGNC:6335
17707 ENSG00000230417 HGNC:31430
17708 ENSG00000230417 HGNC:45111
79713 ENSG00000276085 HGNC:10628
79715 ENSG00000276085 HGNC:30554
# resolved by searching on https://www.genenames.org

cond1 = (hgnc.stable_id == "ENSG00000277796") & (hgnc.hgnc_id == "HGNC:10628")
cond2 = (hgnc.stable_id == "ENSG00000276085") & (hgnc.hgnc_id == "HGNC:30554")
# ENSG00000230417 is mapped to both HGNC:31430 and HGNC:45111
cond3 = hgnc.stable_id == "ENSG00000230417"

hgnc = hgnc[~(cond1 | cond2 | cond3)].set_index("stable_id")
hgnc[hgnc.index.duplicated()]
hgnc_id
stable_id
ENSG00000277768 HGNC:30554
ENSG00000277336 HGNC:30554
ENSG00000288487 HGNC:6335

Entrez

entrez = (
    results_external[results_external["db_name"] == "EntrezGene"]
    .drop_duplicates(["stable_id", "dbprimary_acc"])
    .drop(columns=["xref_id", "db_name"])
)
entrez.rename(columns={"dbprimary_acc": "ncbi_gene_id"}, inplace=True)
entrez = entrez.set_index("stable_id")
dup = entrez[entrez.index.duplicated(keep=False)]
dup
ncbi_gene_id
stable_id
ENSG00000278294 124907156
ENSG00000278294 124907485
ENSG00000278294 124908250
ENSG00000276779 3805
ENSG00000276779 124900568
... ...
ENSG00000273768 124905574
ENSG00000273768 124905808
ENSG00000273768 124905809
ENSG00000178104 9659
ENSG00000178104 124904395

6437 rows × 1 columns

Merge Ensembl with HGNC and Entrez

results_core_group
display_label biotype description synonym
stable_id
ENSG00000000003 TSPAN6 protein_coding tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] TSPAN-6|T245|TM4SF6
ENSG00000000005 TNMD protein_coding tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] tendin|ChM1L|TEM|myodulin|BRICD4
ENSG00000000419 DPM1 protein_coding dolichyl-phosphate mannosyltransferase subunit... CDGIE|MPDS
ENSG00000000457 SCYL3 protein_coding SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... PACE1|PACE-1
ENSG00000000460 C1orf112 protein_coding chromosome 1 open reading frame 112 [Source:HG... FLJ10706
... ... ... ... ...
ENSG00000291313 None protein_coding novel protein
ENSG00000291314 None protein_coding novel protein
ENSG00000291315 None protein_coding novel protein
ENSG00000291316 None protein_coding novel protein, LOC84773-CYHR1 readthrough
ENSG00000291317 TMEM276 protein_coding transmembrane protein 276 [Source:HGNC Symbol;...

69299 rows × 4 columns

df = results_core_group.merge(hgnc, left_index=True, right_index=True, how="outer")
df[df.index.duplicated()]
display_label biotype description synonym hgnc_id
stable_id
ENSG00000277336 CCL3L3 protein_coding C-C motif chemokine ligand 3 like 3 [Source:HG... MGC12815 HGNC:30554
ENSG00000277768 CCL3L3 protein_coding C-C motif chemokine ligand 3 like 3 [Source:HG... MGC12815 HGNC:30554
ENSG00000288487 KIR2DS3 protein_coding killer cell immunoglobulin like receptor, two ... nkat7 HGNC:6335
df = df.merge(entrez, left_index=True, right_index=True, how="outer")
df = df.reset_index()
df.rename(
    columns={
        "stable_id": "ensembl_gene_id",
        "display_label": "symbol",
        "synonym": "synonyms",
    },
    inplace=True,
)
df = df[
    [
        "ensembl_gene_id",
        "symbol",
        "ncbi_gene_id",
        "hgnc_id",
        "biotype",
        "description",
        "synonyms",
    ]
]
df = df[~df["ensembl_gene_id"].isnull()]
df = df.sort_values("ensembl_gene_id").reset_index(drop=True)
df.head()
ensembl_gene_id symbol ncbi_gene_id hgnc_id biotype description synonyms
0 ENSG00000000003 TSPAN6 7105 HGNC:11858 protein_coding tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] TSPAN-6|T245|TM4SF6
1 ENSG00000000005 TNMD 64102 HGNC:17757 protein_coding tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] tendin|ChM1L|TEM|myodulin|BRICD4
2 ENSG00000000419 DPM1 8813 HGNC:3005 protein_coding dolichyl-phosphate mannosyltransferase subunit... CDGIE|MPDS
3 ENSG00000000457 SCYL3 57147 HGNC:19285 protein_coding SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... PACE1|PACE-1
4 ENSG00000000460 C1orf112 55732 HGNC:25565 protein_coding chromosome 1 open reading frame 112 [Source:HG... FLJ10706
df.shape
(75124, 7)
df.to_parquet("df_human__ensembl__release-109__Gene.parquet")

Uploaded to: s3://bionty-assets/human_ensembl_release-109_Gene_lookup.parquet

Mouse

engine = create_engine(url=get_url("mus_musculus_core_109_39"))

Queries

query_core = """
SELECT gene.stable_id, xref.display_label, gene.biotype, gene.description, external_synonym.synonym
FROM gene
LEFT JOIN xref ON gene.display_xref_id = xref.xref_id
LEFT JOIN external_synonym ON gene.display_xref_id = external_synonym.xref_id
"""

query_external = """
SELECT gene.stable_id, object_xref.xref_id, xref.dbprimary_acc, external_db.db_name
FROM gene
LEFT JOIN object_xref ON gene.gene_id = object_xref.ensembl_id
LEFT JOIN xref ON object_xref.xref_id = xref.xref_id
LEFT JOIN external_db ON xref.external_db_id = external_db.external_db_id
WHERE object_xref.ensembl_object_type = 'Gene' AND external_db.db_name IN ('MGI', "EntrezGene")
"""

Query for the basic gene annotations:

results_core = pd.read_sql(query_core, con=engine)
results_core.shape
(84720, 5)
results_core.head()
stable_id display_label biotype description synonym
0 ENSMUSG00000064336 mt-Tf Mt_tRNA mitochondrially encoded tRNA phenylalanine [So... tRNA
1 ENSMUSG00000064336 mt-Tf Mt_tRNA mitochondrially encoded tRNA phenylalanine [So... tRNA-Phe
2 ENSMUSG00000064336 mt-Tf Mt_tRNA mitochondrially encoded tRNA phenylalanine [So... TrnF tRNA
3 ENSMUSG00000064337 mt-Rnr1 Mt_rRNA mitochondrially encoded 12S rRNA [Source:MGI S... 12S ribosomal RNA
4 ENSMUSG00000064337 mt-Rnr1 Mt_rRNA mitochondrially encoded 12S rRNA [Source:MGI S... 12S rRNA
results_core_group = results_core.groupby("stable_id").agg(
    {
        "display_label": "first",
        "biotype": "first",
        "description": "first",
        "synonym": lambda x: "|".join([i for i in set(x) if i is not None]),
    }
)
results_core_group.head()
display_label biotype description synonym
stable_id
ENSMUSG00000000001 Gnai3 protein_coding guanine nucleotide binding protein (G protein)... Galphai3
ENSMUSG00000000003 Pbsn protein_coding probasin [Source:MGI Symbol;Acc:MGI:1860484] PB
ENSMUSG00000000028 Cdc45 protein_coding cell division cycle 45 [Source:MGI Symbol;Acc:... Cdc45l
ENSMUSG00000000031 H19 lncRNA H19, imprinted maternally expressed transcript...
ENSMUSG00000000037 Scml2 protein_coding Scm polycomb group protein like 2 [Source:MGI ... 4932420G07Rik
results_core_group = results_core_group[results_core_group.index.str.startswith("ENS")]
results_core_group.shape
(57010, 4)

Query for external ids:

results_external = pd.read_sql(query_external, con=engine)
results_external = results_external[results_external.stable_id.str.startswith("ENS")]
results_external.shape
(83066, 4)
results_external.head(2)
stable_id xref_id dbprimary_acc db_name
0 ENSMUSG00000064336 1630742 MGI:102487 MGI
1 ENSMUSG00000064337 1630726 MGI:102493 MGI

MGI

mgi = (
    results_external[results_external["db_name"] == "MGI"]
    .drop_duplicates(["stable_id", "dbprimary_acc"])
    .drop(columns=["xref_id", "db_name"])
)
mgi.rename(columns={"dbprimary_acc": "mgi_id"}, inplace=True)
mgi = mgi.set_index("stable_id")
dup = mgi[mgi.index.duplicated(keep=False)]
dup
mgi_id
stable_id
ENSMUSG00000115016 MGI:2145569
ENSMUSG00000115016 MGI:5593065
ENSMUSG00000119828 MGI:5455181
ENSMUSG00000119828 MGI:6721448
ENSMUSG00000082414 MGI:3705775
ENSMUSG00000082414 MGI:5434448

Entrez

entrez = (
    results_external[results_external["db_name"] == "EntrezGene"]
    .drop_duplicates(["stable_id", "dbprimary_acc"])
    .drop(columns=["xref_id", "db_name"])
)
entrez.rename(columns={"dbprimary_acc": "ncbi_gene_id"}, inplace=True)
entrez = entrez.set_index("stable_id")
dup = entrez[entrez.index.duplicated(keep=False)]
dup
ncbi_gene_id
stable_id
ENSMUSG00000094741 331195
ENSMUSG00000094741 100503733
ENSMUSG00000094383 108168683
ENSMUSG00000094383 108168684
ENSMUSG00000094383 108169098
... ...
ENSMUSG00000095545 102639505
ENSMUSG00000089756 667962
ENSMUSG00000089756 102639505
ENSMUSG00000078862 628147
ENSMUSG00000078862 665211

622 rows × 1 columns

Merge ensembl with MGI, Entrez

results_core_group
display_label biotype description synonym
stable_id
ENSMUSG00000000001 Gnai3 protein_coding guanine nucleotide binding protein (G protein)... Galphai3
ENSMUSG00000000003 Pbsn protein_coding probasin [Source:MGI Symbol;Acc:MGI:1860484] PB
ENSMUSG00000000028 Cdc45 protein_coding cell division cycle 45 [Source:MGI Symbol;Acc:... Cdc45l
ENSMUSG00000000031 H19 lncRNA H19, imprinted maternally expressed transcript...
ENSMUSG00000000037 Scml2 protein_coding Scm polycomb group protein like 2 [Source:MGI ... 4932420G07Rik
... ... ... ... ...
ENSMUSG00002076988 Gm56371 rRNA predicted gene, 56371 [Source:MGI Symbol;Acc:M...
ENSMUSG00002076989 Gm23510 snRNA predicted gene, 23510 [Source:MGI Symbol;Acc:M...
ENSMUSG00002076990 Gm22711 snoRNA predicted gene, 22711 [Source:MGI Symbol;Acc:M...
ENSMUSG00002076991 Gm55627 misc_RNA predicted gene, 55627 [Source:MGI Symbol;Acc:M...
ENSMUSG00002076992 Gm54807 misc_RNA predicted gene, 54807 [Source:MGI Symbol;Acc:M...

57010 rows × 4 columns

df = results_core_group.merge(mgi, left_index=True, right_index=True, how="outer")
df[df.index.duplicated()]
display_label biotype description synonym mgi_id
stable_id
ENSMUSG00000082414 Gm13303 unprocessed_pseudogene predicted gene 13303 [Source:MGI Symbol;Acc:MG... MGI:5434448
ENSMUSG00000115016 Gm33906 lncRNA predicted gene, 33906 [Source:MGI Symbol;Acc:M... MGI:5593065
ENSMUSG00000119828 Gm25404 snRNA predicted gene, 25404 [Source:MGI Symbol;Acc:M... MGI:6721448
df = df.merge(entrez, left_index=True, right_index=True, how="outer")
df = df.reset_index()
df.rename(
    columns={
        "stable_id": "ensembl_gene_id",
        "display_label": "symbol",
        "synonym": "synonyms",
    },
    inplace=True,
)
df = df[
    [
        "ensembl_gene_id",
        "symbol",
        "ncbi_gene_id",
        "mgi_id",
        "biotype",
        "description",
        "synonyms",
    ]
]

df = df[~df["ensembl_gene_id"].isnull()]
df = df.sort_values("ensembl_gene_id").reset_index(drop=True)
df.head()
ensembl_gene_id symbol ncbi_gene_id mgi_id biotype description synonyms
0 ENSMUSG00000000001 Gnai3 14679 MGI:95773 protein_coding guanine nucleotide binding protein (G protein)... Galphai3
1 ENSMUSG00000000003 Pbsn 54192 MGI:1860484 protein_coding probasin [Source:MGI Symbol;Acc:MGI:1860484] PB
2 ENSMUSG00000000028 Cdc45 12544 MGI:1338073 protein_coding cell division cycle 45 [Source:MGI Symbol;Acc:... Cdc45l
3 ENSMUSG00000000031 H19 14955 MGI:95891 lncRNA H19, imprinted maternally expressed transcript...
4 ENSMUSG00000000037 Scml2 107815 MGI:1340042 protein_coding Scm polycomb group protein like 2 [Source:MGI ... 4932420G07Rik
df.shape
(57388, 7)
df.to_parquet("df_mouse__ensembl__release-109__Gene.parquet")