Exploring the Ensembl FTP site

from collections import namedtuple
import pandas as pd
from bionty import Species
from nbproject import header

# Display the nbproject metadata header for this notebook; the recorded output
# below shows an id, version, init/run timestamps and package versions.
header()
id: 4rPX1NdzSZVS
version: draft
time_init: 2022-09-26 09:32
time_run: 2022-09-26 09:32
pypackage: nbproject==0.5.3 pandas==1.4.3
# Common name used to select the species record from bionty's species table.
common_name = "human"

# Record for `common_name`; its `scientific_name` and `assembly` fields are
# used to build the FTP directory and file names.
species = Species().df.loc[common_name]
# Directory of the current TSV dumps for this species. The directory name is
# taken from the `species` record looked up above.
# NOTE(review): the trailing slash is kept; later cells join with another "/",
# which yields "//" in the request URL - confirm the server tolerates it.
baseurl = f"http://ftp.ensembl.org/pub/current_tsv/{species.scientific_name}/"
checksums = baseurl + "CHECKSUMS"
# Take the third dot-separated field of the value in the second row, third
# column of the CHECKSUMS listing. Presumably that value is a file name such as
# "Homo_sapiens.GRCh38.107.<...>", making the field the release number.
version = pd.read_fwf(checksums, header=None).iloc[1][2].split(".")[2]

version
'107'
# Shared prefix of every TSV file name in the species directory:
# "<Scientific_name>.<assembly>.<release>."
file_prefix = "{}.{}.{}.".format(
    species.scientific_name.capitalize(),
    species.assembly,
    version,
)
file_prefix
'Homo_sapiens.GRCh38.107.'
# File-name suffixes (appended to `file_prefix`) of the tables explored below,
# keyed by a short table name.
# NOTE(review): the recorded output of this cell also lists a 'karyotype'
# entry that is not defined here - confirm whether it was removed on purpose.
filenames = {
    "canonical": "canonical.tsv",
    "ena": "ena.tsv.gz",
    "entrez": "entrez.tsv.gz",
    "refseq": "refseq.tsv.gz",
    "uniprot": "uniprot.tsv.gz",
}
filenames
{'canonical': 'canonical.tsv',
 'ena': 'ena.tsv.gz',
 'entrez': 'entrez.tsv.gz',
 'karyotype': 'karyotype.tsv.gz',
 'refseq': 'refseq.tsv.gz',
 'uniprot': 'uniprot.tsv.gz'}
# Download the canonical-transcript table. The file is read without a header
# row, so the columns are labelled 0, 1, 2.
canonical_url = f"{baseurl}/{file_prefix}{filenames['canonical']}"
canonical = pd.read_table(canonical_url, header=None)
canonical
0 1 2
0 ENSG00000210049.1 ENST00000387314.1 Ensembl Canonical
1 ENSG00000211459.2 ENST00000389680.2 Ensembl Canonical
2 ENSG00000210077.1 ENST00000387342.1 Ensembl Canonical
3 ENSG00000210082.2 ENST00000387347.2 Ensembl Canonical
4 ENSG00000209082.1 ENST00000386347.1 Ensembl Canonical
... ... ... ...
87439 ENSG00000168509.20 ENST00000336751.11 MANE Select
87440 ENSG00000196859.8 ENST00000355612.7 MANE Select
87441 ENSG00000250479.9 ENST00000484558.3 MANE Select
87442 ENSG00000164488.12 ENST00000366795.4 MANE Select
87443 ENSG00000187533.14 ENST00000344526.10 MANE Select

87444 rows × 3 columns

# Split each versioned gene id (column 0 of `canonical`) at the literal "."
# into separate columns; column 0 of the result is the id without its version.
geneids = canonical[0].str.split(pat=".", regex=False, expand=True)
# Check whether any unversioned gene id occurs more than once.
geneids[0].is_unique
False
# Number of distinct unversioned gene ids (a missing value, if any, counts as
# one distinct value, matching len(unique())).
geneids[0].nunique(dropna=False)
68324
# the unique gene_stable_id values in the ena table match exactly those in the canonical table

# Download the ENA cross-reference table; column names come from the file's
# first row (pandas default).
ena_url = f"{baseurl}/{file_prefix}{filenames['ena']}"
ena = pd.read_table(ena_url)
ena
species taxid gene_stable_id transcript_stable_id protein_stable_id primary_accession secondary_accession
0 Homo_sapiens 9606 ENSG00000000003 ENST00000373020 ENSP00000362111 CM000685 AAH12389
1 Homo_sapiens 9606 ENSG00000000003 ENST00000373020 ENSP00000362111 chrX AAH12389
2 Homo_sapiens 9606 ENSG00000000003 ENST00000373020 ENSP00000362111 NC_000023 AAH12389
3 Homo_sapiens 9606 ENSG00000000003 ENST00000612152 ENSP00000482130 CM000685 NaN
4 Homo_sapiens 9606 ENSG00000000003 ENST00000612152 ENSP00000482130 chrX NaN
... ... ... ... ... ... ... ...
775080 Homo_sapiens 9606 ENSG00000290165 ENST00000703415 NaN chrX NaN
775081 Homo_sapiens 9606 ENSG00000290165 ENST00000703415 NaN NC_000023 NaN
775082 Homo_sapiens 9606 ENSG00000290166 ENST00000702095 NaN CM000681 NaN
775083 Homo_sapiens 9606 ENSG00000290166 ENST00000702095 NaN chr19 NaN
775084 Homo_sapiens 9606 ENSG00000290166 ENST00000702095 NaN NC_000019 NaN

775085 rows × 7 columns

# Number of distinct Ensembl gene ids in the ENA table. The recorded output of
# this cell is a single count, not the table itself.
len(ena["gene_stable_id"].unique())
68324
# the entrez table only contains Ensembl ids that have a mappable Entrez id

# Download the Entrez cross-reference table; column names come from the
# file's first row (pandas default).
entrez_url = f"{baseurl}/{file_prefix}{filenames['entrez']}"
entrez = pd.read_table(entrez_url)
entrez
gene_stable_id transcript_stable_id protein_stable_id xref db_name info_type source_identity xref_identity linkage_type
0 ENSG00000160072 ENST00000673477 ENSP00000500094 83858 EntrezGene DEPENDENT - - -
1 ENSG00000160072 ENST00000472194 - 83858 EntrezGene DEPENDENT - - -
2 ENSG00000160072 ENST00000378736 - 83858 EntrezGene DEPENDENT - - -
3 ENSG00000160072 ENST00000485748 - 83858 EntrezGene DEPENDENT - - -
4 ENSG00000160072 ENST00000474481 - 83858 EntrezGene DEPENDENT - - -
... ... ... ... ... ... ... ... ... ...
223255 ENSG00000212907 ENST00000361335 ENSP00000354728 4539 EntrezGene DEPENDENT - - -
223256 ENSG00000198886 ENST00000361381 ENSP00000354961 4538 EntrezGene DEPENDENT - - -
223257 ENSG00000198786 ENST00000361567 ENSP00000354813 4540 EntrezGene DEPENDENT - - -
223258 ENSG00000198695 ENST00000361681 ENSP00000354665 4541 EntrezGene DEPENDENT - - -
223259 ENSG00000198727 ENST00000361789 ENSP00000354554 4519 EntrezGene DEPENDENT - - -

223260 rows × 9 columns

# Number of distinct Ensembl gene ids that appear in the Entrez table (a
# missing value, if any, counts as one distinct value).
entrez["gene_stable_id"].nunique(dropna=False)
28975
# Download the RefSeq cross-reference table; column names come from the
# file's first row (pandas default).
refseq_url = f"{baseurl}/{file_prefix}{filenames['refseq']}"
refseq = pd.read_table(refseq_url)
refseq
gene_stable_id transcript_stable_id protein_stable_id xref db_name info_type source_identity xref_identity linkage_type
0 ENSG00000160072 ENST00000673477 ENSP00000500094 NP_001304167 RefSeq_peptide INFERRED_PAIR - - -
1 ENSG00000160072 ENST00000673477 ENSP00000500094 NP_114127 RefSeq_peptide DIRECT 100 100 -
2 ENSG00000160072 ENST00000673477 ENSP00000500094 NM_001317238 RefSeq_mRNA DIRECT 90 82 -
3 ENSG00000160072 ENST00000673477 ENSP00000500094 NM_031921 RefSeq_mRNA DIRECT 100 100 -
4 ENSG00000160072 ENST00000673477 ENSP00000500094 XM_005244806 RefSeq_mRNA_predicted DIRECT 45 92 -
... ... ... ... ... ... ... ... ... ...
263548 ENSG00000212907 ENST00000361335 ENSP00000354728 YP_003024034 RefSeq_peptide SEQUENCE_MATCH 100 100 -
263549 ENSG00000198886 ENST00000361381 ENSP00000354961 YP_003024035 RefSeq_peptide SEQUENCE_MATCH 100 100 -
263550 ENSG00000198786 ENST00000361567 ENSP00000354813 YP_003024036 RefSeq_peptide SEQUENCE_MATCH 100 100 -
263551 ENSG00000198695 ENST00000361681 ENSP00000354665 YP_003024037 RefSeq_peptide SEQUENCE_MATCH 100 100 -
263552 ENSG00000198727 ENST00000361789 ENSP00000354554 YP_003024038 RefSeq_peptide SEQUENCE_MATCH 100 100 -

263553 rows × 9 columns

# Download the UniProt cross-reference table; column names come from the
# file's first row (pandas default).
uniprot_url = f"{baseurl}/{file_prefix}{filenames['uniprot']}"
uniprot = pd.read_table(uniprot_url)
uniprot
gene_stable_id transcript_stable_id protein_stable_id xref db_name info_type source_identity xref_identity linkage_type
0 ENSG00000160072 ENST00000673477 ENSP00000500094 Q5T9A4 Uniprot/SWISSPROT DIRECT 100 100 -
1 ENSG00000160072 ENST00000673477 ENSP00000500094 Q5T9A4-1 Uniprot_isoform DIRECT - - -
2 ENSG00000160072 ENST00000308647 ENSP00000311766 A0A5K1VW56 Uniprot/SPTREMBL DIRECT 100 100 -
3 ENSG00000142611 ENST00000511072 ENSP00000426975 D6RDW0 Uniprot/SPTREMBL DIRECT 100 100 -
4 ENSG00000142611 ENST00000378391 ENSP00000367643 Q9HAZ2 Uniprot/SWISSPROT DIRECT - - -
... ... ... ... ... ... ... ... ... ...
158326 ENSG00000198695 ENST00000361681 ENSP00000354665 P03923 Uniprot/SWISSPROT DIRECT 100 100 -
158327 ENSG00000198695 ENST00000361681 ENSP00000354665 A0A1X7RCR6 Uniprot/SPTREMBL SEQUENCE_MATCH 100 100 -
158328 ENSG00000198695 ENST00000361681 ENSP00000354665 U5Z977 Uniprot/SPTREMBL SEQUENCE_MATCH 100 100 -
158329 ENSG00000198727 ENST00000361789 ENSP00000354554 P00156 Uniprot/SWISSPROT DIRECT 100 100 -
158330 ENSG00000198727 ENST00000361789 ENSP00000354554 Q0ZFD6 Uniprot/SPTREMBL SEQUENCE_MATCH 100 100 -

158331 rows × 9 columns