Gene: ensembl, release-112

  • https://www.ensembl.org/info/data/mysql.html

  • https://www.ensembl.org/info/docs/api/core/core_schema.html

Install mysqlclient: https://pypi.org/project/mysqlclient/

from bionty_base.entities._gene import EnsemblGene

# Ensembl release to download; passed to every EnsemblGene(...) call below and
# embedded in each output parquet file name.
version = "release-112"
✅ wrote new records from public sources.yaml to /home/zeth/.lamin/bionty/versions/sources_local.yaml!

if you see this message repeatedly, run: import bionty_base; bionty_base.reset_sources()

Human

# Build the human gene table for the pinned Ensembl release.
ensembl_gene = EnsemblGene(organism="human", version=version)
# Per the log output below, this fetches records from the core DB and then from
# the external DBs.
df = ensembl_gene.download_df()
💡 fetching records from the core DB...
💡 fetching records from the external DBs...
❗ duplicated #rows ensembl_gene_id with ncbi_gene_id: 5904
✅ downloaded Gene table containing 75829 entries.
# Preview the raw download; descriptions still carry the "[Source:...]" suffix.
df.head()
ensembl_gene_id symbol ncbi_gene_id biotype description synonyms
0 ENSG00000000003 TSPAN6 7105 protein_coding tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] TSPAN-6|T245|TM4SF6
1 ENSG00000000005 TNMD 64102 protein_coding tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] TEM|MYODULIN|CHM1L|TENDIN|BRICD4
2 ENSG00000000419 DPM1 8813 protein_coding dolichyl-phosphate mannosyltransferase subunit... CDGIE|MPDS
3 ENSG00000000457 SCYL3 57147 protein_coding SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... PACE1|PACE-1
4 ENSG00000000460 FIRRM 55732 protein_coding FIGNL1 interacting regulator of recombination ... C1ORF112|FLJ10706|APOLO1|FLIP|MEICA1
# https://github.com/laminlabs/bionty/issues/533
# Remove the bracketed annotation found in the downloaded descriptions
# (e.g. "[Source:HGNC Symbol;Acc:HGNC:11858]"). The leading `\s*` and the final
# `.str.strip()` also drop the separating whitespace, so
# "tetraspanin 6 [Source:...]" becomes "tetraspanin 6" instead of
# "tetraspanin 6 " with a dangling trailing space.
# NOTE(review): the pattern removes *every* bracketed group, not only
# "[Source:...]" — confirm that is intended.
df["description"] = (
    df["description"].str.replace(r"\s*\[.*?\]", "", regex=True).str.strip()
)
# Preview the cleaned table.
df.head()
ensembl_gene_id symbol ncbi_gene_id biotype description synonyms
0 ENSG00000000003 TSPAN6 7105 protein_coding tetraspanin 6 TSPAN-6|T245|TM4SF6
1 ENSG00000000005 TNMD 64102 protein_coding tenomodulin TEM|MYODULIN|CHM1L|TENDIN|BRICD4
2 ENSG00000000419 DPM1 8813 protein_coding dolichyl-phosphate mannosyltransferase subunit... CDGIE|MPDS
3 ENSG00000000457 SCYL3 57147 protein_coding SCY1 like pseudokinase 3 PACE1|PACE-1
4 ENSG00000000460 FIRRM 55732 protein_coding FIGNL1 interacting regulator of recombination ... C1ORF112|FLJ10706|APOLO1|FLIP|MEICA1
# Persist the gene table; the file name encodes organism, source and release.
df.to_parquet(f"df_human__ensembl__{version}__Gene.parquet")
# Download the legacy-ID table for the genes in `df`.
# NOTE(review): presumably maps retired Ensembl stable IDs to their current
# ones (cf. the old_stable_id / new_stable_id columns in the mouse preview) —
# confirm against bionty_base.
df_legacy = ensembl_gene.download_legacy_ids_df(df)
# (rows, columns) of the legacy table.
df_legacy.shape
(8390, 14)
# Persist the legacy-ID table; same naming scheme as the gene table, with a
# "df-legacy_" prefix.
df_legacy.to_parquet(f"df-legacy_human__ensembl__{version}__Gene.parquet")

Mouse

# Build the mouse gene table for the same Ensembl release.
# Note: this rebinds `ensembl_gene` and `df`, replacing the human objects.
ensembl_gene = EnsemblGene(organism="mouse", version=version)
df = ensembl_gene.download_df()
💡 fetching records from the core DB...
💡 fetching records from the external DBs...
❗ duplicated #rows ensembl_gene_id with ncbi_gene_id: 518
✅ downloaded Gene table containing 57510 entries.
# Preview the raw download; descriptions still carry the "[Source:...]" suffix.
df.head()
ensembl_gene_id symbol ncbi_gene_id biotype description synonyms
0 ENSMUSG00000000001 Gnai3 14679 protein_coding G protein subunit alpha i3 [Source:MGI Symbol;... Galphai3
1 ENSMUSG00000000003 Pbsn 54192 protein_coding probasin [Source:MGI Symbol;Acc:MGI:1860484] PB
2 ENSMUSG00000000028 Cdc45 12544 protein_coding cell division cycle 45 [Source:MGI Symbol;Acc:... Cdc45l
3 ENSMUSG00000000031 H19 14955 lncRNA H19, imprinted maternally expressed transcript...
4 ENSMUSG00000000037 Scml2 107815 protein_coding Scm polycomb group protein like 2 [Source:MGI ... 4932420G07Rik
# https://github.com/laminlabs/bionty/issues/533
# Remove the bracketed annotation found in the downloaded descriptions
# (e.g. "[Source:MGI Symbol;Acc:MGI:1860484]"). The leading `\s*` and the final
# `.str.strip()` also drop the separating whitespace, so
# "probasin [Source:...]" becomes "probasin" instead of "probasin " with a
# dangling trailing space.
# NOTE(review): the pattern removes *every* bracketed group, not only
# "[Source:...]" — confirm that is intended.
df["description"] = (
    df["description"].str.replace(r"\s*\[.*?\]", "", regex=True).str.strip()
)
# Persist the gene table; the file name encodes organism, source and release.
df.to_parquet(f"df_mouse__ensembl__{version}__Gene.parquet")
# Download the legacy-ID table for the genes in `df`.
# NOTE(review): presumably maps retired Ensembl stable IDs to their current
# ones (cf. the old_stable_id / new_stable_id columns in the preview below) —
# confirm against bionty_base.
df_legacy = ensembl_gene.download_legacy_ids_df(df)
# (rows, columns) of the legacy table.
df_legacy.shape
(30688, 14)
# Preview the legacy-ID table to inspect its columns.
df_legacy.head()
mapping_session_id old_stable_id old_version new_stable_id new_version type score old_db_name new_db_name old_release new_release old_assembly new_assembly created
0 64 ENSMUSG00000116029 3 ENSMUSG00000008658 18 gene 0.99 mus_musculus_core_111_39 mus_musculus_core_112_39 111 112 GRCm39 GRCm39 2023-09-12 19:29:03
1 64 ENSMUSG00000094083 3 ENSMUSG00000023809 12 gene 0.99 mus_musculus_core_111_39 mus_musculus_core_112_39 111 112 GRCm39 GRCm39 2023-09-12 19:29:03
2 59 ENSMUSG00000079169 4 ENSMUSG00000027157 12 gene 0.99 mus_musculus_core_106_39 mus_musculus_core_107_39 106 107 GRCm39 GRCm39 2022-01-12 20:12:50
3 61 ENSMUSG00000095464 2 ENSMUSG00000046516 12 gene 0.99 mus_musculus_core_108_39 mus_musculus_core_109_39 108 109 GRCm39 GRCm39 2022-08-25 23:23:22
4 64 ENSMUSG00000120670 1 ENSMUSG00000052516 21 gene 0.99 mus_musculus_core_111_39 mus_musculus_core_112_39 111 112 GRCm39 GRCm39 2023-09-12 19:29:03
# Persist the legacy-ID table; same naming scheme as the gene table, with a
# "df-legacy_" prefix.
df_legacy.to_parquet(f"df-legacy_mouse__ensembl__{version}__Gene.parquet")

Saccharomyces cerevisiae

# Build the yeast gene table for the same Ensembl release.
# The organism name is passed with a space, not an underscore.
ensembl_gene = EnsemblGene(organism="saccharomyces cerevisiae", version=version)
df = ensembl_gene.download_df()
💡 fetching records from the core DB...
💡 fetching records from the external DBs...
❗ duplicated #rows ensembl_gene_id with ncbi_gene_id: 218
❗ no ensembl_gene_id found, writing to table_id column.
✅ downloaded Gene table containing 7248 entries.
# https://github.com/laminlabs/bionty/issues/533
# Remove bracketed annotations from the descriptions. The leading `\s*` and the
# final `.str.strip()` also drop the whitespace that separated the annotation
# from the text, so no dangling trailing space is left behind.
# NOTE(review): the pattern removes *every* bracketed group, not only
# "[Source:...]" — confirm that is intended for this organism's descriptions.
df["description"] = (
    df["description"].str.replace(r"\s*\[.*?\]", "", regex=True).str.strip()
)
# Persist the gene table. The organism name is written out with a space, so
# the resulting file name contains a space.
df.to_parquet(f"df_saccharomyces cerevisiae__ensembl__{version}__Gene.parquet")
# Download the legacy-ID table, naming the ID column explicitly.
# NOTE(review): the download log reports "no ensembl_gene_id found" for this
# organism, which is presumably why `col="stable_id"` is passed — confirm.
df_legacy = ensembl_gene.download_legacy_ids_df(df, col="stable_id")
# (rows, columns) of the legacy table.
df_legacy.shape
(0, 14)