import numpy as np
import pandas as pd
import requests
import json
# Cookies sent with every request to the NCBI Datasets endpoint.
# Each cookie name is listed exactly once: a dict literal silently keeps only
# the last value for a repeated key, so duplicates would just hide dead values.
# NOTE(review): these look like values captured from a live browser session
# (session id, CSRF tokens, analytics ids); they presumably expire and are
# sensitive -- confirm whether they should be kept in source at all.
cookies = {
    'ncbi_sid': '219124B75BC5FA71_0000SID',
    '_ga': 'GA1.2.1173892900.1706844052',
    'entrezSort': 'protein:',
    '_ga_CSLL4ZEK4L': 'GS1.1.1719640080.71.1.1719644729.0.0.0',
    'pmc-frontend-csrftoken': '6qU2CN4CXtFC1qef70L0fvgdBuyrzYzR',
    'QSI_SI_0HhBb7Qmlxy2ZIF_intercept': 'true',
    '_ga_DP2X732JSX': 'GS1.1.1719640079.63.1.1719644728.0.0.0',
    'QSI_SI_blp2VywNohCsrQy_intercept': 'true',
    'gdh-data-hub-csrftoken': 'xTpBPC3d7tpLZ7rJy2aqVOl77XXl5KMC',
    'QSI_SI_74HHTwHyXgGIyqO_intercept': 'true',
    'QSI_SI_9p2ci2cOSG1dkA5_intercept': 'true',
    'QSI_SI_2uf767hGqQGdwdn_intercept': 'true',
    'django-bibliography-csrftoken': 'ib5E3BiSxGlDcRsW6llQvlloMzuw9O1G',
    'sessionid': 'e30:1sN1PJ:5K4DkLgAmH9_oJfMYC-hnOigQxV8tWkeRnNYWb5E02o',
    '_gid': 'GA1.2.853419842.1719640079',
    '_ce.clock_data': '-3021%2C212.87.194.24%2C1%2Cf1f6b29a6cc1f79a0fea05b885aa33d0%2CChrome%2CJP',
    '_ga_8H17ZWYV40': 'GS1.1.1719644609.2.0.1719644609.0.0.0',
    '_ga_DTBLWJFHPH': 'GS1.1.1719644609.2.0.1719644609.0.0.0',
    '_gat_dap': '1',
    '_ce.irv': 'returning',
    'cebs': '1',
    'cebsp_': '1',
    '_ce.s': 'v~1af29eb7bcbad5aed25d98f057e21c8146ef0b42~lcw~1719644703645~lva~1719644699844~vpv~23~v11.cs~156325~v11.s~e63ff9b0-35e5-11ef-a3fa-e9d125cdf4f8~v11.sla~1719644704299~v11.send~1719644702165~lcw~1719644704299',
    '_gat_ncbiSg': '1',
    'ncbi_pinger': 'N4IgDgTgpgbg+mAFgSwCYgFwgMwCZcDCAItgJykCiAHAaQGLUCsADK8wGwCMALBdtiwIBBAOxUAdJ3EBbOJxABfIA===',
}
# HTTP headers passed to the annotation_report POST request.
# NOTE(review): the sec-ch-ua / sec-fetch-* / user-agent entries suggest this
# set was copied from a Chrome request; "api-key" and "x-csrftoken" are
# hard-coded secret-looking values -- confirm they are safe to keep in source.
headers = {
    "accept": "application/json, text/plain, */*",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "api-key": "27cc0625ebd9931baf17439668edbef05c09",
    "content-type": "application/json",
    "ncbi-phid": "322CD399E8C9FE850000614E3350CA78.1.m_1.03",
    "origin": "https://www.ncbi.nlm.nih.gov",
    "priority": "u=1, i",
    "referer": "https://www.ncbi.nlm.nih.gov/datasets/gene/GCF_000001405.40/",
    # Single-quoted because the values themselves contain double quotes.
    "sec-ch-ua": '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/126.0.0.0 Safari/537.36"
    ),
    "x-csrftoken": "xTpBPC3d7tpLZ7rJy2aqVOl77XXl5KMC",
}
# JSON body of the first annotation_report request: no filters, no sorting,
# 1000 reports per page, and an empty page token for the initial page.
json_data = dict(
    accession="GCF_000001405.40",
    page_size=1000,
    gene_types=[],
    search_text=[],
    locations=[],
    page_token="",
    sort=[],
)
# Endpoint that is POSTed once per page of annotation reports.
ANNOTATION_REPORT_URL = (
    'https://www.ncbi.nlm.nih.gov/datasets/api/datasets/v2alpha/genome/annotation_report'
)
# (connect, read) timeouts in seconds, so a stalled connection cannot hang the
# script forever.
REQUEST_TIMEOUT = (10, 120)
# Column order of the output CSV.
CSV_COLUMNS = [
    "row_id", "chromosomes", "gene_id", "gene_type", "gene_accession_version",
    "gene_begin", "gene_end", "gene_orientation", "gene_name", "gene_symbol",
    "transcripts_accession_version", "transcripts_name",
]


def _extract_row(report):
    """Flatten one annotation report into a dict keyed by ``CSV_COLUMNS``.

    Only the first genomic region, the first range inside it, the first
    chromosome and the first transcript are recorded.

    Args:
        report: One element of the response's ``"reports"`` list.

    Returns:
        A dict with exactly the keys in ``CSV_COLUMNS``. ``chromosomes`` is
        ``None`` when the report has no chromosome entry; both transcript
        fields are ``None`` unless the first transcript provides both its
        accession version and its name.

    Raises:
        KeyError, IndexError: If a mandatory field (row id, gene id, type,
            name, symbol, or the first genomic range) is missing.
    """
    annotation = report["annotation"]
    gene_range = annotation["genomic_regions"][0]["gene_range"]
    first_range = gene_range["range"][0]

    # Optional field: an absent key and an empty list both mean "no value".
    try:
        chromosome = annotation["chromosomes"][0]
    except (KeyError, IndexError):
        chromosome = None

    # Read both transcript fields before storing either one, so a transcript
    # missing only one of them can never leave the two columns out of step.
    try:
        transcript = annotation["transcripts"][0]
        transcript_fields = (transcript["accession_version"], transcript["name"])
    except (KeyError, IndexError):
        transcript_fields = (None, None)

    return {
        "row_id": report["row_id"],
        "chromosomes": chromosome,
        "gene_id": annotation["gene_id"],
        "gene_type": annotation["gene_type"],
        "gene_accession_version": gene_range["accession_version"],
        "gene_begin": first_range["begin"],
        "gene_end": first_range["end"],
        "gene_orientation": first_range["orientation"],
        "gene_name": annotation["name"],
        "gene_symbol": annotation["symbol"],
        "transcripts_accession_version": transcript_fields[0],
        "transcripts_name": transcript_fields[1],
    }


rows = []
# Work on a copy so the first-page request body defined above stays untouched.
request_body = dict(json_data)
while True:
    response = requests.post(
        ANNOTATION_REPORT_URL,
        cookies=cookies,
        headers=headers,
        json=request_body,
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    result = response.json()

    for report in result.get("reports") or []:
        # Echo the raw report to stdout for progress/debugging.
        print(report)
        rows.append(_extract_row(report))

    # A response without a next_page_token is treated as the last page; this
    # replaces a hard-coded total record count as the stop condition. A token
    # identical to the one just sent would loop forever, so stop on that too.
    next_page_token = result.get("next_page_token")
    if not next_page_token or next_page_token == request_body.get("page_token"):
        break
    request_body["page_token"] = next_page_token

# One CSV row per report, columns in CSV_COLUMNS order; None is written as an
# empty cell.
pd.DataFrame(rows, columns=CSV_COLUMNS).to_csv("./GCF_000001405.40.csv", index=False)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步