Ensembl MySQL

  • https://www.ensembl.org/info/data/mysql.html

  • https://www.ensembl.org/info/docs/api/core/core_schema.html

Install mysqlclient: https://pypi.org/project/mysqlclient/

import mysql.connector as sql
import pandas as pd
from sqlalchemy import create_engine

core db

def get_url(db="homo_sapiens_core_109_38"):
    return f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org/{db}"
engine = create_engine(url=get_url())

Pull the full gene table:

gene = pd.read_sql("SELECT * FROM gene", con=engine)
gene.shape
(70616, 16)
gene.head(2)
gene_id biotype analysis_id seq_region_id seq_region_start seq_region_end seq_region_strand display_xref_id source description is_current canonical_transcript_id stable_id version created_date modified_date
0 554 Mt_tRNA 1 132907 577 647 1 2873366.0 insdc mitochondrially encoded tRNA-Phe (UUU/C) [Sour... 1 1587 ENSG00000210049 1 2006-05-12 2006-05-12 00:00:00
1 555 Mt_rRNA 1 132907 648 1601 1 2873345.0 insdc mitochondrially encoded 12S rRNA [Source:HGNC ... 1 1588 ENSG00000211459 2 2006-07-03 2010-01-08 18:00:56

Pull the xref table:

xref = pd.read_sql("SELECT * FROM xref", con=engine)
xref.shape
(2330490, 8)
xref.head(2)
xref_id external_db_id dbprimary_acc display_label version description info_type info_text
0 1 2700 ENSG00000108821 ENSG00000108821 0 None NONE
1 2 50541 LRG_1 LRG_1 0 Locus Reference Genomic record for COL1A1 DIRECT

Pull the external_db table:

  • EntrezGene: 1300

  • HGNC: 1100

  • MGI: 1400

external_db = pd.read_sql("SELECT * FROM external_db", con=engine)
external_db.shape
(478, 10)
external_db[external_db["type"] == "PRIMARY_DB_SYNONYM"].head()
external_db_id db_name db_release status priority db_display_name type secondary_db_name secondary_db_table description
16 821 FlyBaseName_gene 1 KNOWN 5 FlyBase gene name PRIMARY_DB_SYNONYM None None None
19 826 FlyBaseCGID_gene 1 XREF 5 FlyBase gene CGID PRIMARY_DB_SYNONYM None None None
29 1100 HGNC 1 KNOWNXREF 100 HGNC Symbol PRIMARY_DB_SYNONYM None None None
31 1300 EntrezGene 1 KNOWNXREF 250 NCBI gene (formerly Entrezgene) PRIMARY_DB_SYNONYM None None None
32 1400 MGI 1 KNOWNXREF 270 MGI Symbol PRIMARY_DB_SYNONYM None None None

Pull the external_synonym table:

external_synonym = pd.read_sql("SELECT * FROM external_synonym", con=engine)
external_synonym.shape
(56200, 2)
external_synonym.head()
xref_id synonym
0 420056 A2MP
1 423501 ALOX12E
2 438805 CRYB2B
3 440287 bA144G6.6
4 440287 bA342C24.4

Pull dobject_xref genes

object_xref = pd.read_sql(
    "SELECT * FROM object_xref WHERE ensembl_object_type = 'Gene'", con=engine
)
object_xref.shape
(467563, 6)
object_xref.head()
object_xref_id ensembl_id ensembl_object_type xref_id linkage_annotation analysis_id
0 258269 554 Gene 315859 None 8381
1 627342 554 Gene 2873366 None 8381
2 258270 555 Gene 315863 None 8381
3 627336 555 Gene 2873345 None 8381
4 428421 555 Gene 399115 None 8381