Gene: ensembl, release-111

  • https://www.ensembl.org/info/data/mysql.html

  • https://www.ensembl.org/info/docs/api/core/core_schema.html

Prerequisite: install mysqlclient, which is needed to query the Ensembl MySQL databases linked above: https://pypi.org/project/mysqlclient/

from bionty_base.entities._gene import EnsemblGene

# Ensembl release shared by every organism section below; it is passed to
# EnsemblGene and embedded in each output file name.
version = "release-111"

Human

# Build the human gene source for the pinned release and download its gene
# table (the log below reports records fetched from the core and external DBs).
ensembl_gene = EnsemblGene(organism="human", version=version)
df = ensembl_gene.download_df()
💡 fetching records from the core DB...
💡 fetching records from the external DBs...
❗ duplicated #rows ensembl_gene_id with ncbi_gene_id: 6031
✅ downloaded Gene table containing 76062 entries.
# Preview before cleaning: descriptions still end with a "[Source:...]" tag.
df.head()
ensembl_gene_id symbol ncbi_gene_id biotype description synonyms
0 ENSG00000000003 TSPAN6 7105 protein_coding tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] TM4SF6|TSPAN-6|T245
1 ENSG00000000005 TNMD 64102 protein_coding tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] MYODULIN|BRICD4|TEM|CHM1L|TENDIN
2 ENSG00000000419 DPM1 8813 protein_coding dolichyl-phosphate mannosyltransferase subunit... CDGIE|MPDS
3 ENSG00000000457 SCYL3 57147 protein_coding SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... PACE-1|PACE1
4 ENSG00000000460 FIRRM 55732 protein_coding FIGNL1 interacting regulator of recombination ... FLIP|C1ORF112|FLJ10706|APOLO1|MEICA1
# https://github.com/laminlabs/bionty/issues/533
# Drop the bracketed "[Source:...;Acc:...]" tag from each description. The
# whitespace in front of the tag is removed together with it and the result is
# stripped, so cleaned descriptions do not keep a dangling trailing space
# (e.g. "tetraspanin 6" instead of "tetraspanin 6 ").
df["description"] = (
    df["description"].str.replace(r"\s*\[.*?\]", "", regex=True).str.strip()
)
df.head()
ensembl_gene_id symbol ncbi_gene_id biotype description synonyms
0 ENSG00000000003 TSPAN6 7105 protein_coding tetraspanin 6 TM4SF6|TSPAN-6|T245
1 ENSG00000000005 TNMD 64102 protein_coding tenomodulin MYODULIN|BRICD4|TEM|CHM1L|TENDIN
2 ENSG00000000419 DPM1 8813 protein_coding dolichyl-phosphate mannosyltransferase subunit... CDGIE|MPDS
3 ENSG00000000457 SCYL3 57147 protein_coding SCY1 like pseudokinase 3 PACE-1|PACE1
4 ENSG00000000460 FIRRM 55732 protein_coding FIGNL1 interacting regulator of recombination ... FLIP|C1ORF112|FLJ10706|APOLO1|MEICA1
# Write the human gene table to parquet; the release is part of the file name.
df.to_parquet(f"df_human__ensembl__{version}__Gene.parquet")
# Fetch the legacy-ID table for the genes in df.
# NOTE(review): presumably maps retired Ensembl stable IDs to current ones
# (the mouse preview further down shows old_stable_id/new_stable_id columns) — confirm.
df_legacy = ensembl_gene.download_legacy_ids_df(df)
# Displayed as a sanity check on the number of rows and columns.
df_legacy.shape
(8285, 14)
# Write the human legacy-ID table next to the gene table ("df-legacy_" prefix).
df_legacy.to_parquet(f"df-legacy_human__ensembl__{version}__Gene.parquet")

Mouse

# Same workflow as for human: build the mouse gene source for the pinned
# release and download its gene table. Rebinds ensembl_gene and df.
ensembl_gene = EnsemblGene(organism="mouse", version=version)
df = ensembl_gene.download_df()
💡 fetching records from the core DB...
💡 fetching records from the external DBs...
❗ duplicated #rows ensembl_gene_id with ncbi_gene_id: 600
✅ downloaded Gene table containing 57545 entries.
# Preview before cleaning: descriptions still carry a "[Source:MGI Symbol;...]" tag.
df.head()
ensembl_gene_id symbol ncbi_gene_id biotype description synonyms
0 ENSMUSG00000000001 Gnai3 14679 protein_coding guanine nucleotide binding protein (G protein)... Galphai3
1 ENSMUSG00000000003 Pbsn 54192 protein_coding probasin [Source:MGI Symbol;Acc:MGI:1860484] PB
2 ENSMUSG00000000028 Cdc45 12544 protein_coding cell division cycle 45 [Source:MGI Symbol;Acc:... Cdc45l
3 ENSMUSG00000000031 H19 14955 lncRNA H19, imprinted maternally expressed transcript...
4 ENSMUSG00000000037 Scml2 107815 protein_coding Scm polycomb group protein like 2 [Source:MGI ... 4932420G07Rik
# https://github.com/laminlabs/bionty/issues/533
# Drop the bracketed "[Source:...;Acc:...]" tag from each description. The
# whitespace in front of the tag is removed together with it and the result is
# stripped, so cleaned descriptions do not keep a dangling trailing space.
df["description"] = (
    df["description"].str.replace(r"\s*\[.*?\]", "", regex=True).str.strip()
)
# Write the mouse gene table to parquet; the release is part of the file name.
df.to_parquet(f"df_mouse__ensembl__{version}__Gene.parquet")
# Fetch the legacy-ID table for the mouse genes in df.
df_legacy = ensembl_gene.download_legacy_ids_df(df)
# Displayed as a sanity check on the number of rows and columns.
df_legacy.shape
(30685, 14)
# Inspect the legacy-ID columns (old/new stable IDs, releases, assemblies, score).
df_legacy.head()
mapping_session_id old_stable_id old_version new_stable_id new_version type score old_db_name new_db_name old_release new_release old_assembly new_assembly created
0 59 ENSMUSG00000079169 4 ENSMUSG00000027157 12 gene 0.990000 mus_musculus_core_106_39 mus_musculus_core_107_39 106 107 GRCm39 GRCm39 2022-01-12 20:12:50
1 61 ENSMUSG00000095464 2 ENSMUSG00000046516 12 gene 0.990000 mus_musculus_core_108_39 mus_musculus_core_109_39 108 109 GRCm39 GRCm39 2022-08-25 23:23:22
2 59 ENSMUSG00000085431 8 ENSMUSG00000054510 7 gene 0.990000 mus_musculus_core_106_39 mus_musculus_core_107_39 106 107 GRCm39 GRCm39 2022-01-12 20:12:50
3 6 ENSMUSG00000067056 1 ENSMUSG00000070605 1 gene 0.993523 mus_musculus_core_36_34d mus_musculus_core_38_35 36 38 NCBIM34 NCBIM35 2006-03-15 17:41:36
4 6 ENSMUSG00000068846 1 ENSMUSG00000071738 1 gene 0.991369 mus_musculus_core_36_34d mus_musculus_core_38_35 36 38 NCBIM34 NCBIM35 2006-03-15 17:41:36
# Write the mouse legacy-ID table next to the gene table ("df-legacy_" prefix).
df_legacy.to_parquet(f"df-legacy_mouse__ensembl__{version}__Gene.parquet")

Saccharomyces cerevisiae

# Same workflow for yeast. The organism name is passed with a space, not an
# underscore. Rebinds ensembl_gene and df.
ensembl_gene = EnsemblGene(organism="saccharomyces cerevisiae", version=version)
df = ensembl_gene.download_df()
💡 fetching records from the core DB...
💡 fetching records from the external DBs...
❗ duplicated #rows ensembl_gene_id with ncbi_gene_id: 218
❗ no ensembl_gene_id found, writing to table_id column.
✅ downloaded Gene table containing 7248 entries.
# https://github.com/laminlabs/bionty/issues/533
# Drop the bracketed "[Source:...]" tag from each description. The whitespace
# in front of the tag is removed together with it and the result is stripped,
# so cleaned descriptions do not keep a dangling trailing space.
df["description"] = (
    df["description"].str.replace(r"\s*\[.*?\]", "", regex=True).str.strip()
)
# Write the yeast gene table to parquet.
# NOTE(review): the file name contains a space ("saccharomyces cerevisiae");
# confirm this is the name downstream consumers expect.
df.to_parquet(f"df_saccharomyces cerevisiae__ensembl__{version}__Gene.parquet")
# The download log above warns that no ensembl_gene_id was found for yeast, so
# the ID column is named explicitly here instead of relying on the default.
# NOTE(review): the log mentions a "table_id" column while col="stable_id" is
# passed — confirm this is the intended column.
df_legacy = ensembl_gene.download_legacy_ids_df(df, col="stable_id")
# The result is empty (0 rows), and no legacy parquet file is written for yeast.
df_legacy.shape
(0, 14)