import scrapy
from urllib import parse
import os
import requests
import re
import json
from ..items import PeriodicalItem
from ..items import PatentItem
from ..items import CstadItem
from ..items import ConferenceItem
from ..items import ThesisItem
from .. import test_pb2 as pb
import pandas as pd
import numpy as np
class DoctorSpider(scrapy.Spider):
name = 'doctor'
allowed_domains = ['wanfangdata.com.cn']
def start_requests(self):
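# 12.csv is read as the list of search terms: the first column (presumably an index) is
# dropped, and the remaining columns of each row are joined into one space-separated
# search phrase below.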
new = pd.read_csv('12.csv', engine='python', encoding='utf-8')
new.drop(labels=new.columns[0],axis=1,inplace=True)
new_ = np.array(new)
new_lists = new_.tolist()
# print(new_lists)
for i in new_lists:
key = str(i).replace('[','').replace(']','').replace('\'','').replace(' ','').replace(',',' ')
print(key)
page = 1
search_request = pb.SearchService.SearchRequest()
search_request.commonrequest.searchType = "paper"
search_request.commonrequest.searchWord = key
search_request.commonrequest.currentPage = page
search_request.commonrequest.pageSize = 50
search_request.commonrequest.searchFilter.append(0)
search_request.interfaceType = 1
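# NOTE: the cookies below (and the Referer header) look like they were captured via a
# Windows cmd "Copy as cURL" export, so the '^' caret escapes (e.g. '^%^2B') are shell
# artifacts; they would need to be stripped back to plain percent-escapes for the
# session values to be usable.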
cookies = {
'wfpub_token': '26982e2d-03b6-4326-a612-260e93fbe56a',
'rememberpwd_aabbccqwe': 'QiwpuOQe8O7wtHfxwoUN^%^2BfXGJpfbOhqdCyFqrOhAzE4zqSFmpS0S87TDWAZdW8l60ZOrMerys9xI^%^0A22Xb4pVjXCJJqB^%^2BRhogO',
'zh_choose': 'n',
'firstvisit_backurl': 'http^%^3A//www.wanfangdata.com.cn',
'Hm_lvt_838fbc4154ad87515435bf1e10023fab': '1609314875,1609377029,1609403937,1609722037',
'token': 'TGT-8577286-L0bG0ecg6MRkafGuCpfubZsPo3JAMt3INq5JFfDovcRidS3kGJ-my.wanfangdata.com.cn',
'sync_login_wfpub_token': '0179f7fd-7a9d-4b6a-a125-755370240c6c',
'noReadCollection': '0',
'WFKS.Auth': '^%^7B^%^22Context^%^22^%^3A^%^7B^%^22AccountIds^%^22^%^3A^%^5B^%^22Group.hnszyxy^%^22^%^2C^%^22GTimeLimit.hnszyxy^%^22^%^5D^%^2C^%^22Data^%^22^%^3A^%^5B^%^7B^%^22Key^%^22^%^3A^%^22Group.hnszyxy.DisplayName^%^22^%^2C^%^22Value^%^22^%^3A^%^22^%^E6^%^B2^%^B3^%^E5^%^8D^%^97^%^E4^%^B8^%^AD^%^E5^%^8C^%^BB^%^E8^%^8D^%^AF^%^E5^%^A4^%^A7^%^E5^%^AD^%^A6^%^22^%^7D^%^5D^%^2C^%^22SessionId^%^22^%^3A^%^22ce94c89a-f806-487c-aeeb-9d0b270c1bd8^%^22^%^2C^%^22Sign^%^22^%^3A^%^22qKfj844t^%^5C^%^2FWqgLJmLugRk1t5kl7iGuK5^%^5C^%^2F^%^5C^%^2FMn6m4mGwR2d74bDih4^%^5C^%^2F^%^2B1AK1AO0uhJ2^%^22^%^7D^%^2C^%^22LastUpdate^%^22^%^3A^%^222021-01-04T07^%^3A44^%^3A50Z^%^22^%^2C^%^22TicketSign^%^22^%^3A^%^22HsitxZoK82RjKW6xdUMZSQ^%^3D^%^3D^%^22^%^2C^%^22UserIp^%^22^%^3Anull^%^7D',
'SEARCHHISTORY_0': 'UEsDBBQACAgIAJl9JFIAAAAAAAAAAAAAAAABAAAAMO2d^%^2FW^%^2FbxhnH^%^2F5VCAIMWSxPyeEfeGTAKUS^%^2BO^%^0AZfklfreX^%^2FUBLtCVbbyYpK3JRwNualzqLOy^%^2FNmjSOG7RZjWBJ0KzB6rRJ^%^2B8fMlKz^%^2FYneUEr^%^2BJbJQt^%^0AikwTkKUj73iS7^%^2Fvh6bnnnjv9^%^2FuOAqc5ktAE1qwW6csVM5nQgnQx0Babmx8oYgiJGYuB0oGhoem^%^2Fy^%^0AZQFDU^%^2FVEarRcoJcINFPP0AtSplnoOnvWOFNSc7Nqbi6pmuqZRD57JpE7W1ALmv7RYjcXgZyCuWCY^%^0AiyCOKJwSYQmMOSxxEZlTBE5R7DNRTpE5MTj8IQ^%^2Fou2sXC7pmGOl8jr7PzvYj684v1tXL1Sc^%^2FW5^%^2F^%^2F^%^0A2NUoY9qfJlDZ^%^2BNq6uvqflT9aj77beXGdJnaef7X7^%^2BDFLbN^%^2Bwrj6gierWemV1pbL6D2t9de^%^2Fw6l8r^%^0Amxv0sHLvinXlMkv8cHN361NW1cb3lb8^%^2Fs379kqZ3ty7XvllnJ^%^2B^%^2F8svPsO^%^2FrGuWI2mi^%^2FmaOOIvCSd^%^0ADiR0TTW10TRrT0HiiQyRJAm0GT857dTWsTwZHB5V^%^2B0UitdTWpVKpWWvXL2m8xNOGeSaZ^%^2F2ivnm41^%^0AkzllpPIl^%^2B^%^2BBUQZ3TRtLLNFUvMpHXk91MBKoSIVxE4jBNyxxP04TDIkcElkv1IbZ6Qchy6^%^2BWDtozB^%^0AEHvUpcY8u4rWg6F9ldC4Codsqe2sIDiVNkb19Nycpo^%^2Bqc92HFbce3a5c2rpg1v68bv24ad3fotpW^%^0AHz7c2V6p3f7UerJp^%^2FeXn2u1vOoQBQTyEACIEIUwkQtV1RmBRDEV6lktIEIiPgFcR4Km6zghEwmkd^%^0A5aY0AQAfAY8igCSqrjMCA4n^%^2BqfJwIglEHwHPIiBSdZ0RODcQPV8azoyI2P8i8CwCvEh4ZwTkeSVu^%^0A9oN^%^2BUfYJ8CgBUOZF7NIJ9IRJTETTiEc^%^2BAl5FAFJ1nRGIKhMLC^%^2F3zZSRDHwGvIsBTdZ0RkJSlhfLI^%^0AUhLLvlvAqwiIMlXXGYHcRDEvhpUlAngfAY8iAAhV1xmBopgwwjBUlEQfAY8igIlA1XVGYFyZgblx^%^0AFQvtGxDsV5tqRdWmotmKYdTFAWBrKttn97n0KQwKpLm^%^2FO3qRnU3PoEYWDtereVvUcCD6ppXbCaVe^%^0AeZDDwX2FX1V^%^2BdFrixcbuyqWuCwFra4NNUVx6eiHwXv2kdf0mJY1muTN69mj2zvZT66dVekizLwQ6^%^0AA2KBPzzJ0YBYEnhBdoY4pScGYvL0hIwEvx^%^2FzZj8GMaDqOiPAx2PTOKaVoOx7NryKgISous4IlEEW^%^0A9OHBaeiPaTyKgCxLPHQb0wyP9I5H^%^2BWS^%^2FLGAfAU8iQAEQqbrOCGQjojkaHcDvdqoT8AcheDd6Hwed^%^0AITXuUDOlRYB51xnNTFwn45OCJr5LR7Yv9P8qNKQPwIsu^%^2FurseDIb7ycikluLF3tDpU0tkdqTOqmZ^%^0AajrjLPgx7t73K31AKr65TAKmCrjEGZV7^%^2BsdyE^%^2Bdp8bbJ5CDL^%^2B29Pgg84gABPn2gOQDhMnwhL0ToA^%^0AUsIvz4lBdk5i54RXRdhT0C4HDxzSD^%^2FXykF0blI7glzbOpc3u^%^2BstYjiVn07phBotmKq93z6oZQzuV^%^0AyOvakKan841jNUM1TJaVYnlYM^%^2FJFPaE1MnTa0tqQqlMB7W4tpy6N7LXrYared8fpg^%^2FcaPojKk5vW^%^0A^%^2Faf1Yq^%^2FBmNCUMVGGNOXMGER98dJiLyFt6Qle212F6c1PbLAiHJZtVmg5sTV31aH41fA^%^2B9vYTKzeo^%^0APuQ2YrUDTomyMgQz1FlhniP1BOEUye6jCPsQbj6my59Zj6^%^2FU7t446mOyw2NfUNkZCFvrdd^%^2FR7p9^%^2B^%^0Ard66X733RfXBvyufrXSKK0luZj1SviTME5c^%^2BrBQ1h^%^2FSpixMAtC9E5nUAU0JMSyY21Zg^%^2Fllw9v1f7^%^0A2^%^2FfHGyqx^%^2BRejiDDlxRmq3uXEmKJLMdRGx8RvQyVzmHZWChOP6krTLJtqfyx7reratcqttd3r3x5v^%^0AwAS^%^2BOWGAQOTm9xAWxssTBhombRnz^%^2Bgayg0oC4YnLgFUcOY9Eido4^%^2FiyFJ51TkI6PCC^%^2B7EACl9Fxh^%^0AIT4J^%^2BLYg4H6jHifHRYvDCABk2sTOOqSjofnJyLkYLezr8BZ1EOkfcfHspBK9hcVBpSRLbbGMTq4O^%^0AkihLLlNncaM4szgw1Ct0gIPNyzLIAi^%^2B4jD5lyQj1xuQ4ltoyiX2CdZCx2^%^2B0wHe2RexOzWSi2fxrR^%^0AXQbfXvt^%^2F2msEAQDdFktE1bCC1KUZAtrnb^%^2FTnmN7KHBNGVERnpeeEeD6fmpzyQ0c8easjJEPI824u^%^0AFD4VmSpML^%^2Frh8F5FQJJ45BoOv7woJUYLyznUngnMVv20tv81WHfY2vuYnOxg6OraNev6Pys3P^%^2Fdq^%^0AMLTcPAoSichthjR7fnR2fmBRQEL7wmJ8iH2IW4MY85RPl1CS^%^2FnjvQMGIIb59QXw^%^2BxD7ELUGMEKB8^%^0AuvTEsFCYWYpkxDYOHn2IfYhbghhjtmmMM8QhZa7PHB6VBdJJNvGrgZFNswJOOLt0cFZd^%^2B5dHwQWC^%^0Aw6JAAimULitbo^%^2BXcxNQSFPhOItfvfv3u98AmDVAQXFwSqGcyFl0aNJHUSYawxMJbWcQYZFKTIFM4^%^0AKDJSTjbElbtrOz99Vft286RBTJDkNq0Jh5Z7VDCC2zid5tsQvg3x2zYEgQC7zf^%^2FJc1CJlIfjhPjb^%^0AYxwDhE^%^2Fo9hgE0leXcIbY1JAKy5GLCHWSEVEX3V4so4Rsa4KwJTNsBdSJpnhn^%^2B0rt^%^2BVrtxiOvUixi^%^0AB4qZR80lVlBCkZF0cRCJnbSKgrBFCkx92yZudWmO5^%^2BCtPXhYudtklY8nyMXAYc9dgTLpDC4Zi2TJ^%^0AspCX27jt8us4IghkS25YbAHP2GUERzgin3CCq1^%^2FcqqzcrWx^%^2B6VGIkRPEkPLpDHG4L1UYiaJZCNu3^%^0AZ2wL3jTMdFZaXLrmPXbXru2u^%^2FuBRcAXSdJsQSi6G0GUueSBh9qGRmXkEO6n7lezVksTODnEk6hsQ^%^0ADRfanWc7z7^%^2F2rg0hOnS^%^2FElsq4wyxNjFt
akZPArS4sVXTn8wyU5qRNt7sN7OEI8od^%^2Fc0sYa^%^2Bp6y18^%^0A8AvocEy^%^2B^%^2FUNWEiA8^%^2Fec^%^2B^%^2BcN^%^2FAVBLBwg6zwtu3AgAADZsAAA^%^3D^%^0A',
'CASTGC': 'TGT-8918879-zwRiVbYXCSHisu3Y9vYFFZ1zFV4kS1pldjhTjmVfIaoB0M6BBq-my.wanfangdata.com.cn',
'Hm_lpvt_838fbc4154ad87515435bf1e10023fab': '1609746447',
}
headers = {
'Proxy-Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
'X-User-Agent': 'grpc-web-javascript/0.1',
'X-Grpc-Web': '1',
'Content-Type': 'application/grpc-web+proto',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
'Accept': '*/*',
'Origin': 'http://s.wanfangdata.com.cn',
'Referer': 'http://s.wanfangdata.com.cn/thesis?q=^%^E4^%^B8^%^AD^%^E5^%^9B^%^BE^%^E5^%^88^%^86^%^E7^%^B1^%^BB^%^E5^%^8F^%^B7^%^3AR-01',
'Accept-Language': 'zh-CN,zh;q=0.9',
}
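# gRPC-Web frames each protobuf message with a 5-byte prefix: one flag byte
# (0 = uncompressed) followed by the message length as a 4-byte big-endian integer.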
bytes_body = search_request.SerializeToString()
bytes_head = bytes([0]) + len(bytes_body).to_bytes(4, byteorder='big')
data = bytes_head+bytes_body
response = requests.post('http://s.wanfangdata.com.cn/SearchService.SearchService/search', headers=headers, cookies=cookies, data=data)
res = response.text
# print(res)
pattern = r"Ch[A-Za-z0-9%]+"
# Record-ID prefixes returned by the search API:
# 【conference】conference papers: ChZDb
# 【periodical】journal articles: ChlQZ
# 【thesis】degree theses: ChJUa
# 【patent】patents: ChJQY
# 【cstad】sci-tech achievements: ChFDc
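# The prefix of each returned record ID identifies its type, which decides which
# Detail endpoint the ID is POSTed to in the if/elif chain below.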
res1 = re.compile(pattern).findall(res)
m = 0
for i in res1:
if len(i) > 20:
# print('一:这是第{}页的内容--------------------------------------------------------------------------------'.format(page))
m += 1
# print(m)
# print(i)
data = {
'Id': i
}
data = json.dumps(data)
if 'ChZDb' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Conference/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Conference_html)
elif 'ChlQZ' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Periodical/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Periodical_html)
elif 'ChJUa' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Thesis/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Thesis_html)
elif 'ChJQY' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Patent/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Patent_html)
elif 'ChFDc' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Cstad/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Cstad_html)
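# Keep requesting subsequent pages until a page returns no record IDs.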
while res1 != []:
page += 1
search_request.commonrequest.currentPage = page
bytes_body = search_request.SerializeToString()
bytes_head = bytes([0]) + len(bytes_body).to_bytes(4, byteorder='big')  # 1 flag byte + 4-byte big-endian length
data = bytes_head+bytes_body
response = requests.post('http://s.wanfangdata.com.cn/SearchService.SearchService/search', headers=headers, cookies=cookies, data=data)
res = response.text
pattern = r"Ch[A-Za-z0-9%]+"
res1 = re.compile(pattern).findall(res)
for i in res1:
if len(i) > 20:
# print('二:这是第{}页的内容========================================================================================'.format(page))
m += 1
# print(m)
# print(i)
data = {
'Id': i
}
data = json.dumps(data)
if 'ChZDb' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Conference/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Conference_html)
elif 'ChlQZ' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Periodical/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Periodical_html)
elif 'ChJUa' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Thesis/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Thesis_html)
elif 'ChJQY' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Patent/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Patent_html)
elif 'ChFDc' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Cstad/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Cstad_html)
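# A minimal helper sketch (not wired into the requests above; the name is illustrative):
# it builds the same 5-byte gRPC-Web frame used in start_requests for any protobuf message.
def grpc_web_frame(self, message):
body = message.SerializeToString()
return bytes([0]) + len(body).to_bytes(4, byteorder='big') + body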
def double_requests(self,response):
print('================ Second pass: fetching detail pages from the follow-up author/affiliation search ================')
key = response.meta['key']
print(key)
page = 1
search_request = pb.SearchService.SearchRequest()
search_request.commonrequest.searchType = "paper"
search_request.commonrequest.searchWord = key
search_request.commonrequest.currentPage = page
search_request.commonrequest.pageSize = 50
search_request.commonrequest.searchFilter.append(0)
search_request.interfaceType = 1
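# The session cookies and headers below are duplicated verbatim from start_requests
# (the same "Copy as cURL" caret-escape caveat applies).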
cookies = {
'wfpub_token': '26982e2d-03b6-4326-a612-260e93fbe56a',
'rememberpwd_aabbccqwe': 'QiwpuOQe8O7wtHfxwoUN^%^2BfXGJpfbOhqdCyFqrOhAzE4zqSFmpS0S87TDWAZdW8l60ZOrMerys9xI^%^0A22Xb4pVjXCJJqB^%^2BRhogO',
'zh_choose': 'n',
'firstvisit_backurl': 'http^%^3A//www.wanfangdata.com.cn',
'Hm_lvt_838fbc4154ad87515435bf1e10023fab': '1609314875,1609377029,1609403937,1609722037',
'token': 'TGT-8577286-L0bG0ecg6MRkafGuCpfubZsPo3JAMt3INq5JFfDovcRidS3kGJ-my.wanfangdata.com.cn',
'sync_login_wfpub_token': '0179f7fd-7a9d-4b6a-a125-755370240c6c',
'noReadCollection': '0',
'WFKS.Auth': '^%^7B^%^22Context^%^22^%^3A^%^7B^%^22AccountIds^%^22^%^3A^%^5B^%^22Group.hnszyxy^%^22^%^2C^%^22GTimeLimit.hnszyxy^%^22^%^5D^%^2C^%^22Data^%^22^%^3A^%^5B^%^7B^%^22Key^%^22^%^3A^%^22Group.hnszyxy.DisplayName^%^22^%^2C^%^22Value^%^22^%^3A^%^22^%^E6^%^B2^%^B3^%^E5^%^8D^%^97^%^E4^%^B8^%^AD^%^E5^%^8C^%^BB^%^E8^%^8D^%^AF^%^E5^%^A4^%^A7^%^E5^%^AD^%^A6^%^22^%^7D^%^5D^%^2C^%^22SessionId^%^22^%^3A^%^22ce94c89a-f806-487c-aeeb-9d0b270c1bd8^%^22^%^2C^%^22Sign^%^22^%^3A^%^22qKfj844t^%^5C^%^2FWqgLJmLugRk1t5kl7iGuK5^%^5C^%^2F^%^5C^%^2FMn6m4mGwR2d74bDih4^%^5C^%^2F^%^2B1AK1AO0uhJ2^%^22^%^7D^%^2C^%^22LastUpdate^%^22^%^3A^%^222021-01-04T07^%^3A44^%^3A50Z^%^22^%^2C^%^22TicketSign^%^22^%^3A^%^22HsitxZoK82RjKW6xdUMZSQ^%^3D^%^3D^%^22^%^2C^%^22UserIp^%^22^%^3Anull^%^7D',
'SEARCHHISTORY_0': 'UEsDBBQACAgIAJl9JFIAAAAAAAAAAAAAAAABAAAAMO2d^%^2FW^%^2FbxhnH^%^2F5VCAIMWSxPyeEfeGTAKUS^%^2BO^%^0AZfklfreX^%^2FUBLtCVbbyYpK3JRwNualzqLOy^%^2FNmjSOG7RZjWBJ0KzB6rRJ^%^2B8fMlKz^%^2FYneUEr^%^2BJbJQt^%^0AikwTkKUj73iS7^%^2Fvh6bnnnjv9^%^2FuOAqc5ktAE1qwW6csVM5nQgnQx0Babmx8oYgiJGYuB0oGhoem^%^2Fy^%^0AZQFDU^%^2FVEarRcoJcINFPP0AtSplnoOnvWOFNSc7Nqbi6pmuqZRD57JpE7W1ALmv7RYjcXgZyCuWCY^%^0AiyCOKJwSYQmMOSxxEZlTBE5R7DNRTpE5MTj8IQ^%^2Fou2sXC7pmGOl8jr7PzvYj684v1tXL1Sc^%^2FW5^%^2F^%^2F^%^0A2NUoY9qfJlDZ^%^2BNq6uvqflT9aj77beXGdJnaef7X7^%^2BDFLbN^%^2Bwrj6gierWemV1pbL6D2t9de^%^2Fw6l8r^%^0Amxv0sHLvinXlMkv8cHN361NW1cb3lb8^%^2Fs379kqZ3ty7XvllnJ^%^2B^%^2F8svPsO^%^2FrGuWI2mi^%^2FmaOOIvCSd^%^0ADiR0TTW10TRrT0HiiQyRJAm0GT857dTWsTwZHB5V^%^2B0UitdTWpVKpWWvXL2m8xNOGeSaZ^%^2F2ivnm41^%^0AkzllpPIl^%^2B^%^2BBUQZ3TRtLLNFUvMpHXk91MBKoSIVxE4jBNyxxP04TDIkcElkv1IbZ6Qchy6^%^2BWDtozB^%^0AEHvUpcY8u4rWg6F9ldC4Codsqe2sIDiVNkb19Nycpo^%^2Bqc92HFbce3a5c2rpg1v68bv24ad3fotpW^%^0AHz7c2V6p3f7UerJp^%^2FeXn2u1vOoQBQTyEACIEIUwkQtV1RmBRDEV6lktIEIiPgFcR4Km6zghEwmkd^%^0A5aY0AQAfAY8igCSqrjMCA4n^%^2BqfJwIglEHwHPIiBSdZ0RODcQPV8azoyI2P8i8CwCvEh4ZwTkeSVu^%^0A9oN^%^2BUfYJ8CgBUOZF7NIJ9IRJTETTiEc^%^2BAl5FAFJ1nRGIKhMLC^%^2F3zZSRDHwGvIsBTdZ0RkJSlhfLI^%^0AUhLLvlvAqwiIMlXXGYHcRDEvhpUlAngfAY8iAAhV1xmBopgwwjBUlEQfAY8igIlA1XVGYFyZgblx^%^0AFQvtGxDsV5tqRdWmotmKYdTFAWBrKttn97n0KQwKpLm^%^2FO3qRnU3PoEYWDtereVvUcCD6ppXbCaVe^%^0AeZDDwX2FX1V^%^2BdFrixcbuyqWuCwFra4NNUVx6eiHwXv2kdf0mJY1muTN69mj2zvZT66dVekizLwQ6^%^0AA2KBPzzJ0YBYEnhBdoY4pScGYvL0hIwEvx^%^2FzZj8GMaDqOiPAx2PTOKaVoOx7NryKgISous4IlEEW^%^0A9OHBaeiPaTyKgCxLPHQb0wyP9I5H^%^2BWS^%^2FLGAfAU8iQAEQqbrOCGQjojkaHcDvdqoT8AcheDd6Hwed^%^0AITXuUDOlRYB51xnNTFwn45OCJr5LR7Yv9P8qNKQPwIsu^%^2FurseDIb7ycikluLF3tDpU0tkdqTOqmZ^%^0AajrjLPgx7t73K31AKr65TAKmCrjEGZV7^%^2BsdyE^%^2Bdp8bbJ5CDL^%^2B29Pgg84gABPn2gOQDhMnwhL0ToA^%^0AUsIvz4lBdk5i54RXRdhT0C4HDxzSD^%^2FXykF0blI7glzbOpc3u^%^2BstYjiVn07phBotmKq93z6oZQzuV^%^0AyOvakKan841jNUM1TJaVYnlYM^%^2FJFPaE1MnTa0tqQqlMB7W4tpy6N7LXrYared8fpg^%^2FcaPojKk5vW^%^0A^%^2Faf1Yq^%^2FBmNCUMVGGNOXMGER98dJiLyFt6Qle212F6c1PbLAiHJZtVmg5sTV31aH41fA^%^2B9vYTKzeo^%^0APuQ2YrUDTomyMgQz1FlhniP1BOEUye6jCPsQbj6my59Zj6^%^2FU7t446mOyw2NfUNkZCFvrdd^%^2FR7p9^%^2B^%^0Ard66X733RfXBvyufrXSKK0luZj1SviTME5c^%^2BrBQ1h^%^2FSpixMAtC9E5nUAU0JMSyY21Zg^%^2Fllw9v1f7^%^0A2^%^2FfHGyqx^%^2BRejiDDlxRmq3uXEmKJLMdRGx8RvQyVzmHZWChOP6krTLJtqfyx7reratcqttd3r3x5v^%^0AwAS^%^2BOWGAQOTm9xAWxssTBhombRnz^%^2Bgayg0oC4YnLgFUcOY9Eido4^%^2FiyFJ51TkI6PCC^%^2B7EACl9Fxh^%^0AIT4J^%^2BLYg4H6jHifHRYvDCABk2sTOOqSjofnJyLkYLezr8BZ1EOkfcfHspBK9hcVBpSRLbbGMTq4O^%^0AkihLLlNncaM4szgw1Ct0gIPNyzLIAi^%^2B4jD5lyQj1xuQ4ltoyiX2CdZCx2^%^2B0wHe2RexOzWSi2fxrR^%^0AXQbfXvt^%^2F2msEAQDdFktE1bCC1KUZAtrnb^%^2FTnmN7KHBNGVERnpeeEeD6fmpzyQ0c8easjJEPI824u^%^0AFD4VmSpML^%^2Frh8F5FQJJ45BoOv7woJUYLyznUngnMVv20tv81WHfY2vuYnOxg6OraNev6Pys3P^%^2Fdq^%^0AMLTcPAoSichthjR7fnR2fmBRQEL7wmJ8iH2IW4MY85RPl1CS^%^2FnjvQMGIIb59QXw^%^2BxD7ELUGMEKB8^%^0AuvTEsFCYWYpkxDYOHn2IfYhbghhjtmmMM8QhZa7PHB6VBdJJNvGrgZFNswJOOLt0cFZd^%^2B5dHwQWC^%^0Aw6JAAimULitbo^%^2BXcxNQSFPhOItfvfv3u98AmDVAQXFwSqGcyFl0aNJHUSYawxMJbWcQYZFKTIFM4^%^0AKDJSTjbElbtrOz99Vft286RBTJDkNq0Jh5Z7VDCC2zid5tsQvg3x2zYEgQC7zf^%^2FJc1CJlIfjhPjb^%^0AYxwDhE^%^2Fo9hgE0leXcIbY1JAKy5GLCHWSEVEX3V4so4Rsa4KwJTNsBdSJpnhn^%^2B0rt^%^2BVrtxiOvUixi^%^0AB4qZR80lVlBCkZF0cRCJnbSKgrBFCkx92yZudWmO5^%^2BCtPXhYudtklY8nyMXAYc9dgTLpDC4Zi2TJ^%^0AspCX27jt8us4IghkS25YbAHP2GUERzgin3CCq1^%^2FcqqzcrWx^%^2B6VGIkRPEkPLpDHG4L1UYiaJZCNu3^%^0AZ2wL3jTMdFZaXLrmPXbXru2u^%^2FuBRcAXSdJsQSi6G0GUueSBh9qGRmXkEO6n7lezVksTODnEk6hsQ^%^0ADRfanWc7z7^%^2F2rg0hOnS^%^2FElsq4wyxNjFt
akZPArS4sVXTn8wyU5qRNt7sN7OEI8od^%^2Fc0sYa^%^2Bp6y18^%^0A8AvocEy^%^2B^%^2FUNWEiA8^%^2Fec^%^2B^%^2BcN^%^2FAVBLBwg6zwtu3AgAADZsAAA^%^3D^%^0A',
'CASTGC': 'TGT-8918879-zwRiVbYXCSHisu3Y9vYFFZ1zFV4kS1pldjhTjmVfIaoB0M6BBq-my.wanfangdata.com.cn',
'Hm_lpvt_838fbc4154ad87515435bf1e10023fab': '1609746447',
}
headers = {
'Proxy-Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
'X-User-Agent': 'grpc-web-javascript/0.1',
'X-Grpc-Web': '1',
'Content-Type': 'application/grpc-web+proto',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
'Accept': '*/*',
'Origin': 'http://s.wanfangdata.com.cn',
'Referer': 'http://s.wanfangdata.com.cn/thesis?q=^%^E4^%^B8^%^AD^%^E5^%^9B^%^BE^%^E5^%^88^%^86^%^E7^%^B1^%^BB^%^E5^%^8F^%^B7^%^3AR-01',
'Accept-Language': 'zh-CN,zh;q=0.9',
}
bytes_body = search_request.SerializeToString()
bytes_head = bytes([0]) + len(bytes_body).to_bytes(4, byteorder='big')  # 1 flag byte + 4-byte big-endian length
data = bytes_head+bytes_body
response = requests.post('http://s.wanfangdata.com.cn/SearchService.SearchService/search', headers=headers, cookies=cookies, data=data)
res = response.text
# print(res)
pattern = r"Ch[A-Za-z0-9%]+"
# 【conference】conference papers: ChZDb
# 【periodical】journal articles: ChlQZ
# 【thesis】degree theses: ChJUa
# 【patent】patents: ChJQY
# 【cstad】sci-tech achievements: ChFDc
res1 = re.compile(pattern).findall(res)
m = 0
for i in res1:
if len(i) > 20:
print('Pass 1: contents of page {} --------------------------------------------------------------------------------'.format(page))
m += 1
print(m)
print(i)
data = {
'Id': i
}
data = json.dumps(data)
if 'ChZDb' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Conference/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Conference_html)
elif 'ChlQZ' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Periodical/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Periodical_html)
elif 'ChJUa' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Thesis/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Thesis_html)
elif 'ChJQY' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Patent/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Patent_html)
elif 'ChFDc' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Cstad/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Cstad_html)
while res1 != []:
page += 1
search_request.commonrequest.currentPage = page
bytes_body = search_request.SerializeToString()
bytes_head = bytes([0]) + len(bytes_body).to_bytes(4, byteorder='big')  # 1 flag byte + 4-byte big-endian length
data = bytes_head+bytes_body
response = requests.post('http://s.wanfangdata.com.cn/SearchService.SearchService/search', headers=headers, cookies=cookies, data=data)
res = response.text
pattern = r"Ch[A-Za-z0-9%]+"
res1 = re.compile(pattern).findall(res)
for i in res1:
if len(i) > 20:
print('Pass 2: contents of page {} ========================================================================================'.format(page))
m += 1
print(m)
print(i)
data = {
'Id': i
}
data = json.dumps(data)
if 'ChZDb' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Conference/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Conference_html)
elif 'ChlQZ' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Periodical/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Periodical_html)
elif 'ChJUa' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Thesis/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Thesis_html)
elif 'ChJQY' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Patent/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Patent_html)
elif 'ChFDc' in i:
yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Cstad/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Cstad_html)
# Callback (Periodical_html): parse a journal-article detail page
def Periodical_html(self,response):
meta = response.meta
res = json.loads(response.text)
# print(res)
print('============= Parsing the periodical detail page =============')
url = 'http://d.wanfangdata.com.cn/periodical/' + meta['Id']
# name_1 = response.meta['name']
html = json.loads(response.text)
text_html = html['detail'][0]['periodical']
# print(text_html)
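# Field names in the detail payload vary between records (snake_case vs CamelCase,
# e.g. 'authors_name' vs 'Creator'), hence the nested try/except fallbacks below.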
""" 作者 """
authors = ''
try:
authors_names = str(text_html['authors_name'])
except:
try:
authors = str(text_html['Creator'])
except:
authors = ''
else:
authors = authors.replace('[', '').replace(']', '').replace('\'', '').replace('"','')
else:
authors = authors_names.replace('[', '').replace(']', '').replace('\'', '')
authorunits = ''
""" 作者及作者单位 """
# 需要回调搜索函数,把作者及单位带回到参数当中
try:
authorsandunits = text_html['AuthorOrg']
except:
try:
authorsandunits = text_html['authorsandunit']
except:
authorunits = ''
else:
# for au in authorsandunits:
# print(au)
# yield scrapy.Request(url='http://d.wanfangdata.com.cn',meta={'key':au},dont_filter=True,callback=self.double_requests)
authorunits = str(authorsandunits).replace('[', '').replace(']', '').replace('\'', '')
else:
# for au in authorsandunits:
# print(au)
# yield scrapy.Request(url='http://d.wanfangdata.com.cn',meta={'key':au},dont_filter=True,callback=self.double_requests)
authorunits = str(authorsandunits).replace('[', '').replace(']', '').replace('\'', '')
""" 分类号 """
classcode4search = ''
try:
classcode4search = str(text_html['OriginalClassCode'])
except:
try:
classcode4search = str(text_html['classcode4search'])
except:
classcode4search = ''
else:
classcode4search = classcode4search.replace('[', '').replace(']', '').replace('\'', '')
else:
classcode4search = classcode4search.replace('[', '').replace(']', '').replace('\'', '')
""" 刊名 """
PeriodicalTitle= ''
try:
PeriodicalTitle = str(text_html['perio_title'])
except:
try:
PeriodicalTitle = str(text_html['PeriodicalTitle'][0])
except:
PeriodicalTitle = ''
""" 关键字 """
keywords = ''
try:
# keyword = str(text_html['keywords'])
keyword = str(text_html['Keywords'])
except:
try:
keywords = str(text_html['MachinedKeywords'])
except:
keywords = ''
else:
keywords = keywords.replace('[', '').replace(']', '').replace('\'', '').replace('"', '')
else:
keywords = keyword.replace('[', '').replace(']', '').replace('\'', '').replace('"', '')
""" 题目 """
try:
title = str(text_html['title'][0]).replace('[', '').replace(']', '').replace('\'', '').replace('<','').replace('>','').replace('/','').replace('em','')
except:
title = str(text_html['Title'][0]).replace('[', '').replace(']', '').replace('\'', '').replace('<','').replace('>','').replace('/','').replace('em','')
""" 目的 """
summary = ''
try:
summary = str(text_html['summary']).replace('[', '').replace(']', '').replace('\'', '').replace('"', '').replace('\r', '').replace('\n', '')
except:
try:
summary = str(text_html['Abstract']).replace('[', '').replace(']', '').replace('\'', '').replace('"', '').replace('\r', '').replace('\n', '')
except:
summary = ''
""" 年卷期 """
Pvi = ''
try:
# PublishYear = str(text_html['publish_year'])
PublishYear = str(text_html['PublishYear'])
except:
print('这一篇没有年的内容')
else:
Pvi = PublishYear
try:
# Issue = text_html['issue_num'].replace('"', '')
Issue = text_html['Issue'].replace('"', '')
except:
print('这一篇没有期的内容')
else:
Pvi = PublishYear + ',' + '(' + Issue + ')'
try:
# Volum = text_html['volume'].replace('"', '')
Volum = text_html['Volum'].replace('"', '')
except:
print('这一篇没有卷的内容')
else:
Pvi = PublishYear + ',' + Volum + '(' + Issue + ')'
""" 页数 """
page_cnt = ''
try:
page_cnt = str(text_html['page_cnt'])
except:
try:
page_cnt = str(text_html['PageNo'])
except:
page_cnt = ''
""" 页码 """
page_range = ''
try:
page_range = str(text_html['page_range'])
except:
try:
page_range = str(text_html['Page'])
except:
page_range = ''
print('url={}'.format(url))
print('authors={}'.format(authors))
print('affiliations={}'.format(authorunits))
print('classcode={}'.format(classcode4search))
print('journal={}'.format(PeriodicalTitle))
print('keywords={}'.format(keywords))
print('title={}'.format(title))
print('abstract={}'.format(summary))
print('page count={}'.format(page_cnt))
print('page range={}'.format(page_range))
print('year/volume/issue={}'.format(Pvi))
path_0 = os.path.join(r'/baiyu/app/doc-3.0','periodical')
""" Create the periodical folder """
if not os.path.exists(path_0):
os.mkdir(path_0)
path_01 = os.path.join(path_0,'期刊')
""" Create the 期刊 sub-folder """
if not os.path.exists(path_01):
os.mkdir(path_01)
path_1 = os.path.join(path_01,PeriodicalTitle)
""" Create the journal-name folder """
if not os.path.exists(path_1):
os.mkdir(path_1)
path_2 = os.path.join(path_1,PublishYear)
""" Create the year folder """
if not os.path.exists(path_2):
os.mkdir(path_2)
if title:
""" Write the article detail file """
filename = str(re.compile('[\u4e00-\u9fff]+').findall(title))
filename = re.sub(r'[\\/:*?"<>|]','-',filename)
filename = filename.replace('[','').replace(']','').replace('\'','')
with open(os.path.join(path_2, filename + '.txt'),'a',encoding='utf8') as fp:
fp.write(url + '\n' +authors + '\n' +authorunits + '\n' +classcode4search + '\n' +PeriodicalTitle + '\n' +keywords + '\n' +title + '\n' +summary + '\n' +page_cnt + '\n' +page_range + '\n' +Pvi)
Periodical_item = PeriodicalItem(
url = url,
author = authors,
unit = authorunits,
classcode4search = classcode4search,
periodical = PeriodicalTitle,
keyword = keywords,
title = title,
abstract = summary,
page = page_cnt,
pagenumber = page_range,
year_issue_volume = Pvi,
)
yield Periodical_item
def Patent_html(self,response):
url = 'http://d.wanfangdata.com.cn/patent/' + response.meta['Id']
# name_1 = response.meta['name']
html = json.loads(response.text)
text_html = html['detail'][0]['patent']
# print(text_html)
""" 专利名称 """
title = str(text_html['Title']).replace('[','').replace(']','').replace('\'','')
""" 摘要 """
abstract = str(text_html['Abstract']).replace('[','').replace(']','').replace('\'','').replace('\r','').replace('\n','')
""" 专利类型 """
PatentType = text_html['PatentType']
""" 申请/专利号 """
PatentCode = text_html['PatentCode']
""" 申请日期 """
ApplicationDate = text_html['ApplicationDate']
""" 公开/公告号 """
PublicationNo = text_html['PublicationNo']
""" 公开/公告日 """
PublicationDate = text_html['PublicationDate']
""" 申请/专利权人 """
Applicant = str(text_html['Applicant']).replace('[','').replace(']','').replace('\'','')
""" 发明/设计人 """
Inventor = str(text_html['Inventor']).replace('[','').replace(']','').replace('\'','')
""" 主权项 """
SignoryItem = text_html['SignoryItem']
print(url)
print(title)
print(abstract)
print(PatentType)
print(PatentCode)
print(ApplicationDate)
print(PublicationNo)
print(PublicationDate)
print(Applicant)
print(Inventor)
print(SignoryItem)
path_0 = os.path.join(r'/baiyu/app/doc-2.0','patent')
""" Create the patent folder """
if not os.path.exists(path_0):
os.mkdir(path_0)
path_1 = os.path.join(path_0,'专利')
""" Create the 专利 sub-folder """
if not os.path.exists(path_1):
os.mkdir(path_1)
ti = str(ApplicationDate)
path_2 = os.path.join(path_1,ti)
""" Create the application-date folder """
if not os.path.exists(path_2):
os.mkdir(path_2)
if title:
""" Write the patent detail file """
filename = str(re.compile('[\u4e00-\u9fff]+').findall(title))
filename = re.sub(r'[\\/:*?"<>|]','-',filename)
filename = filename.replace('[','').replace(']','').replace('\'','')
with open(os.path.join(path_2, filename + '.txt'),'a',encoding='utf8') as fp:
fp.write(url + '\n' +title + '\n' +abstract + '\n' +str(PatentType) + '\n' +str(PatentCode) + '\n' +str(ApplicationDate) + '\n' +str(PublicationNo) + '\n' +str(PublicationDate) + '\n' +Applicant + '\n' +Inventor + '\n' +str(SignoryItem))
Patent_item = PatentItem(
url = url,
title = title,
abstract = abstract,
patentType = PatentType,
patentCode = PatentCode,
applicationDate = ApplicationDate,
publicationNo = PublicationNo,
publicationDate = PublicationDate,
applicant = Applicant,
inventor = Inventor,
signoryItem = SignoryItem
)
yield Patent_item
def Cstad_html(self,response):
meta = response.meta
new_html = json.loads(response.text)
html_data = new_html['detail'][0]['Cstad']
url = 'http://d.wanfangdata.com.cn/cstad/' + meta['Id']
# Title
title = html_data['Title'][0]
# Project annual number (record id)
Cstad_id = str(html_data['Id'])
# Achievement category
achievementType = html_data['AchievementType']
# Year the achievement was published
publishYear = str(html_data['PublishYear'])
# Completing organizations
organizations = str(html_data['Organization']).replace('[','').replace(']','').replace('\"','').replace('\'','')
# Applicable industry name
tradeName = str(html_data['TradeName']).replace('[','').replace(']','').replace('\"','').replace('\'','')
# Applicable industry code
tradeCode = str(html_data['TradeCode']).replace('[','').replace(']','').replace('\"','').replace('\'','')
# Contact organization name
contactUnit = html_data['ContactUnit']
# Contact organization address
contactAddress = html_data['ContactAddress']
# Achievement abstract
abstract = str(html_data['Abstract']).replace('[','').replace(']','').replace('\"','').replace('\'','')
# Keywords
keyword = str(html_data['Keywords']).replace('[','').replace(']','').replace('\"','').replace('\'','')
# Authors
author = str(html_data['Creator']).replace('[','').replace(']','').replace('\"','').replace('\'','')
print(url)
print(title)
print(Cstad_id)
print(achievementType)
print(publishYear)
print(organizations)
print(tradeName)
print(tradeCode)
print(contactUnit)
print(contactAddress)
print(abstract)
print(keyword)
print(author)
path_0 = os.path.join(r'/baiyu/app/doc-2.0','cstad')
""" Create the cstad folder """
if not os.path.exists(path_0):
os.mkdir(path_0)
path_1 = os.path.join(path_0,'成果')
""" Create the 成果 sub-folder """
if not os.path.exists(path_1):
os.mkdir(path_1)
path_2 = os.path.join(path_1,publishYear)
""" Create the year folder """
if not os.path.exists(path_2):
os.mkdir(path_2)
if title:
""" Write the achievement detail file """
filename = str(re.compile('[\u4e00-\u9fff]+').findall(title))
filename = re.sub(r'[\\/:*?"<>|]','-',filename)
filename = filename.replace('[','').replace(']','').replace('\'','')
with open(os.path.join(path_2, filename + '.txt'),'a',encoding='utf8') as fp:
fp.write(url + '\n' +title + '\n' +Cstad_id + '\n' +achievementType + '\n' +publishYear + '\n' +organizations + '\n' +tradeName + '\n' +tradeCode + '\n' +contactUnit + '\n' + contactAddress+ '\n' + abstract+ '\n' + keyword+ '\n' + author)
Cstad_item = CstadItem(
url = url,
title = title,
Cstad_id = Cstad_id,
achievementType = achievementType,
publishYear = publishYear,
organizations = organizations,
tradeName = tradeName,
tradeCode = tradeCode,
contactUnit = contactUnit,
contactAddress = contactAddress,
abstract = abstract,
keyword = keyword,
author = author,
)
yield Cstad_item
def Conference_html(self,response):
url = 'http://d.wanfangdata.com.cn/conference/' + response.meta['Id']
html = json.loads(response.text)
text_html = html['detail'][0]['conference']
""" 学位论文题目 """
title = text_html['Title'][0]
""" 摘要 """
abstract = str(text_html['Abstract']).replace('[','').replace(']','').replace('\'','').replace('\r','').replace('\n','').replace('\\u3000\\u3000','')
""" 作者 """
author = str(text_html['Creator']).replace('[','').replace(']','').replace('\'','')
""" 机标分类号 """
classcode = str(text_html['MachinedClassCode']).replace('[','').replace(']','').replace('\'','')
if classcode == '':
classcode = str(text_html['ClassCode']).replace('[','').replace(']','').replace('\'','')
""" 论文关键字 """
keywords = str(text_html['Keywords']).replace('[','').replace(']','').replace('\'','')
if keywords == '':
keywords = str(text_html['MachinedKeywords']).replace('[','').replace(']','').replace('\'','')
""" 作者及单位 """
authororg = str(text_html['AuthorOrg']).replace('[','').replace(']','').replace('\'','')
""" 会议时间 """
publisdate = str(text_html['MeetingDate'])
""" 会议地点 """
publishyear = str(text_html['MeetingArea'])
""" 页码 """
page = str(text_html['Page'])
print(url)
print(title)
print(abstract)
print(author)
print(classcode)
print(keywords)
print(authororg)
print(publisdate)
print(publishyear)
print(page)
path_0 = os.path.join(r'/baiyu/app/doc-2.0','conference')
""" Create the conference folder """
if not os.path.exists(path_0):
os.mkdir(path_0)
path_1 = os.path.join(path_0,'会议')
""" Create the 会议 sub-folder """
if not os.path.exists(path_1):
os.mkdir(path_1)
ti = str(publisdate)
path_2 = os.path.join(path_1,ti)
""" Create the meeting-date folder """
if not os.path.exists(path_2):
os.mkdir(path_2)
if title:
""" Write the conference-paper detail file """
filename = str(re.compile('[\u4e00-\u9fff]+').findall(title))
filename = re.sub(r'[\\/:*?"<>|]','-',filename)
filename = filename.replace('[','').replace(']','').replace('\'','')
with open(os.path.join(path_2, filename + '.txt'),'a',encoding='utf8') as fp:
fp.write(url + '\n' +title + '\n' +abstract + '\n' +author + '\n' +classcode + '\n' +keywords + '\n' +authororg + '\n' +publisdate + '\n' +publishyear + '\n' +page)
Conference_item = ConferenceItem(
url = url,
title = title,
abstract = abstract,
author = author,
classcode = classcode,
keywords = keywords,
authororg = authororg,
publisdate = publisdate,
publishyear = publishyear,
page = page,
)
yield Conference_item
def Thesis_html(self,response):
url = 'http://d.wanfangdata.com.cn/thesis/' + response.meta['Id']
# name_1 = response.meta['name']
html = json.loads(response.text)
text_html = html['detail'][0]['thesis']
# print(text_html)
""" 学位论文题目 """
title = text_html['Title'][0]
""" 论文简介 """
abstract = str(text_html['Abstract']).replace('[','').replace(']','').replace('\'','').replace('\r','').replace('\n','')
""" 论文作者 """
author = str(text_html['Creator']).replace('[','').replace(']','').replace('\'','')
""" 论文分类号 """
classcode = str(text_html['MachinedClassCode']).replace('[','').replace(']','').replace('\'','')
if classcode == '':
classcode = str(text_html['ClassCode']).replace('[','').replace(']','').replace('\'','')
""" 论文关键字 """
keywords = str(text_html['Keywords']).replace('[','').replace(']','').replace('\'','')
if keywords == '':
keywords = str(text_html['MachinedKeywords']).replace('[','').replace(']','').replace('\'','')
""" 作者及单位 """
authororg = str(text_html['AuthorOrg']).replace('[','').replace(']','').replace('\'','')
""" 学科专业 """
major = text_html['Major']
""" 导师姓名 """
tutor = str(text_html['Tutor']).replace('[','').replace(']','').replace('\'','')
""" 学位年度 """
publishyear = str(text_html['PublishYear'])
""" 授予学位 """
degree = text_html['Degree']
print(url)
print(title)
print(abstract)
print(author)
print(classcode)
print(keywords)
print(authororg)
print(major)
print(tutor)
print(publishyear)
print(degree)
path_0 = os.path.join(r'/baiyu/app/doc-2.0','thesis')
""" Create the thesis folder """
if not os.path.exists(path_0):
os.mkdir(path_0)
path_1 = os.path.join(path_0,'学位')
""" Create the 学位 sub-folder """
if not os.path.exists(path_1):
os.mkdir(path_1)
ti = str(publishyear)
path_2 = os.path.join(path_1,ti)
""" Create the degree-year folder """
if not os.path.exists(path_2):
os.mkdir(path_2)
if title:
""" Write the thesis detail file """
filename = str(re.compile('[\u4e00-\u9fff]+').findall(title))
filename = re.sub(r'[\\/:*?"<>|]','-',filename)
filename = filename.replace('[','').replace(']','').replace('\'','')
with open(os.path.join(path_2, filename + '.txt'),'a',encoding='utf8') as fp:
fp.write(url + '\n' +title + '\n' +abstract + '\n' +author + '\n' +classcode + '\n' +keywords + '\n' +authororg + '\n' +major + '\n' +tutor + '\n' +publishyear + '\n' +degree)
Thesis_item = ThesisItem(
url = url,
title = title,
abstract = abstract,
author = author,
classcode = classcode,
keywords = keywords,
authororg = authororg,
major = major,
tutor = tutor,
publishyear = publishyear,
degree = degree
)
yield Thesis_item