"""Fetch human gene information from NCBI."""

import numpy as np
import pandas as pd
import requests
import json

# Cookie jar sent with every request to www.ncbi.nlm.nih.gov.
#
# The values look like they were copied from a browser session, so they are
# presumably short-lived -- TODO confirm whether the endpoint needs them at all.
#
# The original literal listed `_ga`, `_ga_CSLL4ZEK4L` and `_ga_DP2X732JSX`
# more than once; in a dict literal only the last value of a repeated key
# survives, so each key is now written once with that surviving value (kept
# at the position of its first occurrence, which is where the dict stored it).
cookies = {
    'ncbi_sid': '219124B75BC5FA71_0000SID',
    '_ga': 'GA1.2.1173892900.1706844052',
    'entrezSort': 'protein:',
    '_ga_CSLL4ZEK4L': 'GS1.1.1719640080.71.1.1719644729.0.0.0',
    'pmc-frontend-csrftoken': '6qU2CN4CXtFC1qef70L0fvgdBuyrzYzR',
    'QSI_SI_0HhBb7Qmlxy2ZIF_intercept': 'true',
    '_ga_DP2X732JSX': 'GS1.1.1719640079.63.1.1719644728.0.0.0',
    'QSI_SI_blp2VywNohCsrQy_intercept': 'true',
    'gdh-data-hub-csrftoken': 'xTpBPC3d7tpLZ7rJy2aqVOl77XXl5KMC',
    'QSI_SI_74HHTwHyXgGIyqO_intercept': 'true',
    'QSI_SI_9p2ci2cOSG1dkA5_intercept': 'true',
    'QSI_SI_2uf767hGqQGdwdn_intercept': 'true',
    'django-bibliography-csrftoken': 'ib5E3BiSxGlDcRsW6llQvlloMzuw9O1G',
    'sessionid': 'e30:1sN1PJ:5K4DkLgAmH9_oJfMYC-hnOigQxV8tWkeRnNYWb5E02o',
    '_gid': 'GA1.2.853419842.1719640079',
    '_ce.clock_data': '-3021%2C212.87.194.24%2C1%2Cf1f6b29a6cc1f79a0fea05b885aa33d0%2CChrome%2CJP',
    '_ga_8H17ZWYV40': 'GS1.1.1719644609.2.0.1719644609.0.0.0',
    '_ga_DTBLWJFHPH': 'GS1.1.1719644609.2.0.1719644609.0.0.0',
    '_gat_dap': '1',
    '_ce.irv': 'returning',
    'cebs': '1',
    'cebsp_': '1',
    '_ce.s': 'v~1af29eb7bcbad5aed25d98f057e21c8146ef0b42~lcw~1719644703645~lva~1719644699844~vpv~23~v11.cs~156325~v11.s~e63ff9b0-35e5-11ef-a3fa-e9d125cdf4f8~v11.sla~1719644704299~v11.send~1719644702165~lcw~1719644704299',
    '_gat_ncbiSg': '1',
    'ncbi_pinger': 'N4IgDgTgpgbg+mAFgSwCYgFwgMwCZcDCAItgJykCiAHAaQGLUCsADK8wGwCMALBdtiwIBBAOxUAdJ3EBbOJxABfIA===',
}

# HTTP headers sent with every annotation-report request.
#
# The user-agent and sec-ch-ua values identify Chrome 126 on Windows, so the
# set appears to have been copied from a browser request. Cookies are passed
# separately through the `cookies` mapping, so no `cookie` header is set here.
headers = {
    # Content negotiation and API credentials.
    "accept": "application/json, text/plain, */*",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "api-key": "27cc0625ebd9931baf17439668edbef05c09",
    "content-type": "application/json",
    # Request provenance.
    "ncbi-phid": "322CD399E8C9FE850000614E3350CA78.1.m_1.03",
    "origin": "https://www.ncbi.nlm.nih.gov",
    "priority": "u=1, i",
    "referer": "https://www.ncbi.nlm.nih.gov/datasets/gene/GCF_000001405.40/",
    # Browser client hints (values contain literal double quotes).
    "sec-ch-ua": '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    # Same value as the `gdh-data-hub-csrftoken` cookie.
    "x-csrftoken": "xTpBPC3d7tpLZ7rJy2aqVOl77XXl5KMC",
}

# Request body for the first page. Later pages reuse it with `page_token`
# replaced by the `next_page_token` returned in the previous response.
json_data = {
    'accession': 'GCF_000001405.40',
    'page_size': 1000,
    'gene_types': [],
    'search_text': [],
    'locations': [],
    'page_token': '',
    'sort': [],
}

ANNOTATION_REPORT_URL = (
    'https://www.ncbi.nlm.nih.gov/datasets/api/datasets/v2alpha/genome/annotation_report'
)
OUTPUT_CSV = "./GCF_000001405.40.csv"

# CSV column order; `extract_gene_row` returns its keys in this same order.
COLUMNS = [
    "row_id",
    "chromosomes",
    "gene_id",
    "gene_type",
    "gene_accession_version",
    "gene_begin",
    "gene_end",
    "gene_orientation",
    "gene_name",
    "gene_symbol",
    "transcripts_accession_version",
    "transcripts_name",
]


def _first(items, default=None):
    """Return the first element of *items*, or *default* if it is missing or empty."""
    return items[0] if items else default


def extract_gene_row(report):
    """Flatten one annotation report into a dict keyed by ``COLUMNS``.

    Only the first genomic region, the first range of that region, the first
    chromosome and the first transcript are exported.

    Args:
        report: One element of the response's ``"reports"`` list.

    Returns:
        A dict with exactly the keys in ``COLUMNS``. ``chromosomes`` and the
        two transcript fields are ``None`` when the report does not provide
        them (key absent, list empty, or field absent on the transcript).

    Raises:
        KeyError, IndexError: If a mandatory field (row id, gene id/type,
            name, symbol, or the first genomic range) is missing.
    """
    annotation = report["annotation"]
    gene_range = annotation["genomic_regions"][0]["gene_range"]
    span = gene_range["range"][0]
    # Each optional field is resolved independently, so a transcript that has
    # an accession but no name can never leave the columns misaligned.
    transcript = _first(annotation.get("transcripts"), {})
    return {
        "row_id": report["row_id"],
        "chromosomes": _first(annotation.get("chromosomes")),
        "gene_id": annotation["gene_id"],
        "gene_type": annotation["gene_type"],
        "gene_accession_version": gene_range["accession_version"],
        "gene_begin": span["begin"],
        "gene_end": span["end"],
        "gene_orientation": span["orientation"],
        "gene_name": annotation["name"],
        "gene_symbol": annotation["symbol"],
        "transcripts_accession_version": transcript.get("accession_version"),
        "transcripts_name": transcript.get("name"),
    }


def iter_gene_reports(request_cookies, request_headers, timeout=60):
    """Yield every annotation report, following ``next_page_token`` to the end.

    Paging stops when a response carries no ``next_page_token``. It does not
    rely on a known gene total (68253 when the script was written), so the
    last page is never requested twice and a larger annotation is not
    truncated.

    Args:
        request_cookies: Cookie mapping sent with each request.
        request_headers: Header mapping sent with each request.
        timeout: Per-request timeout in seconds.

    Yields:
        The report dicts of each page, in server order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    page_token = json_data["page_token"]
    page_number = 0
    while True:
        response = requests.post(
            ANNOTATION_REPORT_URL,
            cookies=request_cookies,
            headers=request_headers,
            json=dict(json_data, page_token=page_token),
            timeout=timeout,
        )
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        result = response.json()
        reports = result.get("reports") or []
        page_number += 1
        print(f"page {page_number}: {len(reports)} reports")
        yield from reports
        page_token = result.get("next_page_token")
        if not page_token:
            break


def main():
    """Download all gene reports and write them to ``OUTPUT_CSV``."""
    rows = [extract_gene_row(report) for report in iter_gene_reports(cookies, headers)]
    pd.DataFrame(rows, columns=COLUMNS).to_csv(OUTPUT_CSV, index=False)


if __name__ == "__main__":
    main()