Scraping document information from Wanfang Data (wanfangdata.com.cn)

import scrapy
from urllib import parse
import os
import requests

import re
import json

from ..items import PeriodicalItem
from ..items import PatentItem
from ..items import CstadItem
from ..items import ConferenceItem
from ..items import ThesisItem
from .. import test_pb2 as pb
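# test_pb2 is assumed to be a protoc-generated module built from the Wanfang search service's .proto definition; it supplies the SearchService.SearchRequest message used below.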

import pandas as pd
import numpy as np

class DoctorSpider(scrapy.Spider):   
    name = 'doctor'
    allowed_domains = ['wanfangdata.com.cn']

    def start_requests(self):
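        # Read the search keywords from 12.csv: drop the first (index) column, then flatten each row into a single space-separated keyword string.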
        new = pd.read_csv('12.csv', engine='python', encoding='utf-8')
        new.drop(labels=new.columns[0],axis=1,inplace=True)
        new_ = np.array(new)
        new_lists = new_.tolist()
        # print(new_lists)
        for i in new_lists:
            key = str(i).replace('[','').replace(']','').replace('\'','').replace(' ','').replace(',',' ')
            print(key)
            page = 1
            search_request = pb.SearchService.SearchRequest()
            search_request.commonrequest.searchType = "paper"
            search_request.commonrequest.searchWord = key
            search_request.commonrequest.currentPage = page
            search_request.commonrequest.pageSize = 50
            search_request.commonrequest.searchFilter.append(0)
            search_request.interfaceType = 1
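            # The SearchRequest built above is serialized below and POSTed as a gRPC-web frame; the cookies and headers that follow were captured from a logged-in browser session and will expire over time.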


            cookies = {
                'wfpub_token': '26982e2d-03b6-4326-a612-260e93fbe56a',
                'rememberpwd_aabbccqwe': 'QiwpuOQe8O7wtHfxwoUN^%^2BfXGJpfbOhqdCyFqrOhAzE4zqSFmpS0S87TDWAZdW8l60ZOrMerys9xI^%^0A22Xb4pVjXCJJqB^%^2BRhogO',
                'zh_choose': 'n',
                'firstvisit_backurl': 'http^%^3A//www.wanfangdata.com.cn',
                'Hm_lvt_838fbc4154ad87515435bf1e10023fab': '1609314875,1609377029,1609403937,1609722037',
                'token': 'TGT-8577286-L0bG0ecg6MRkafGuCpfubZsPo3JAMt3INq5JFfDovcRidS3kGJ-my.wanfangdata.com.cn',
                'sync_login_wfpub_token': '0179f7fd-7a9d-4b6a-a125-755370240c6c',
                'noReadCollection': '0',
                'WFKS.Auth': '^%^7B^%^22Context^%^22^%^3A^%^7B^%^22AccountIds^%^22^%^3A^%^5B^%^22Group.hnszyxy^%^22^%^2C^%^22GTimeLimit.hnszyxy^%^22^%^5D^%^2C^%^22Data^%^22^%^3A^%^5B^%^7B^%^22Key^%^22^%^3A^%^22Group.hnszyxy.DisplayName^%^22^%^2C^%^22Value^%^22^%^3A^%^22^%^E6^%^B2^%^B3^%^E5^%^8D^%^97^%^E4^%^B8^%^AD^%^E5^%^8C^%^BB^%^E8^%^8D^%^AF^%^E5^%^A4^%^A7^%^E5^%^AD^%^A6^%^22^%^7D^%^5D^%^2C^%^22SessionId^%^22^%^3A^%^22ce94c89a-f806-487c-aeeb-9d0b270c1bd8^%^22^%^2C^%^22Sign^%^22^%^3A^%^22qKfj844t^%^5C^%^2FWqgLJmLugRk1t5kl7iGuK5^%^5C^%^2F^%^5C^%^2FMn6m4mGwR2d74bDih4^%^5C^%^2F^%^2B1AK1AO0uhJ2^%^22^%^7D^%^2C^%^22LastUpdate^%^22^%^3A^%^222021-01-04T07^%^3A44^%^3A50Z^%^22^%^2C^%^22TicketSign^%^22^%^3A^%^22HsitxZoK82RjKW6xdUMZSQ^%^3D^%^3D^%^22^%^2C^%^22UserIp^%^22^%^3Anull^%^7D',
                'SEARCHHISTORY_0': 'UEsDBBQACAgIAJl9JFIAAAAAAAAAAAAAAAABAAAAMO2d^%^2FW^%^2FbxhnH^%^2F5VCAIMWSxPyeEfeGTAKUS^%^2BO^%^0AZfklfreX^%^2FUBLtCVbbyYpK3JRwNualzqLOy^%^2FNmjSOG7RZjWBJ0KzB6rRJ^%^2B8fMlKz^%^2FYneUEr^%^2BJbJQt^%^0AikwTkKUj73iS7^%^2Fvh6bnnnjv9^%^2FuOAqc5ktAE1qwW6csVM5nQgnQx0Babmx8oYgiJGYuB0oGhoem^%^2Fy^%^0AZQFDU^%^2FVEarRcoJcINFPP0AtSplnoOnvWOFNSc7Nqbi6pmuqZRD57JpE7W1ALmv7RYjcXgZyCuWCY^%^0AiyCOKJwSYQmMOSxxEZlTBE5R7DNRTpE5MTj8IQ^%^2Fou2sXC7pmGOl8jr7PzvYj684v1tXL1Sc^%^2FW5^%^2F^%^2F^%^0A2NUoY9qfJlDZ^%^2BNq6uvqflT9aj77beXGdJnaef7X7^%^2BDFLbN^%^2Bwrj6gierWemV1pbL6D2t9de^%^2Fw6l8r^%^0Amxv0sHLvinXlMkv8cHN361NW1cb3lb8^%^2Fs379kqZ3ty7XvllnJ^%^2B^%^2F8svPsO^%^2FrGuWI2mi^%^2FmaOOIvCSd^%^0ADiR0TTW10TRrT0HiiQyRJAm0GT857dTWsTwZHB5V^%^2B0UitdTWpVKpWWvXL2m8xNOGeSaZ^%^2F2ivnm41^%^0AkzllpPIl^%^2B^%^2BBUQZ3TRtLLNFUvMpHXk91MBKoSIVxE4jBNyxxP04TDIkcElkv1IbZ6Qchy6^%^2BWDtozB^%^0AEHvUpcY8u4rWg6F9ldC4Codsqe2sIDiVNkb19Nycpo^%^2Bqc92HFbce3a5c2rpg1v68bv24ad3fotpW^%^0AHz7c2V6p3f7UerJp^%^2FeXn2u1vOoQBQTyEACIEIUwkQtV1RmBRDEV6lktIEIiPgFcR4Km6zghEwmkd^%^0A5aY0AQAfAY8igCSqrjMCA4n^%^2BqfJwIglEHwHPIiBSdZ0RODcQPV8azoyI2P8i8CwCvEh4ZwTkeSVu^%^0A9oN^%^2BUfYJ8CgBUOZF7NIJ9IRJTETTiEc^%^2BAl5FAFJ1nRGIKhMLC^%^2F3zZSRDHwGvIsBTdZ0RkJSlhfLI^%^0AUhLLvlvAqwiIMlXXGYHcRDEvhpUlAngfAY8iAAhV1xmBopgwwjBUlEQfAY8igIlA1XVGYFyZgblx^%^0AFQvtGxDsV5tqRdWmotmKYdTFAWBrKttn97n0KQwKpLm^%^2FO3qRnU3PoEYWDtereVvUcCD6ppXbCaVe^%^0AeZDDwX2FX1V^%^2BdFrixcbuyqWuCwFra4NNUVx6eiHwXv2kdf0mJY1muTN69mj2zvZT66dVekizLwQ6^%^0AA2KBPzzJ0YBYEnhBdoY4pScGYvL0hIwEvx^%^2FzZj8GMaDqOiPAx2PTOKaVoOx7NryKgISous4IlEEW^%^0A9OHBaeiPaTyKgCxLPHQb0wyP9I5H^%^2BWS^%^2FLGAfAU8iQAEQqbrOCGQjojkaHcDvdqoT8AcheDd6Hwed^%^0AITXuUDOlRYB51xnNTFwn45OCJr5LR7Yv9P8qNKQPwIsu^%^2FurseDIb7ycikluLF3tDpU0tkdqTOqmZ^%^0AajrjLPgx7t73K31AKr65TAKmCrjEGZV7^%^2BsdyE^%^2Bdp8bbJ5CDL^%^2B29Pgg84gABPn2gOQDhMnwhL0ToA^%^0AUsIvz4lBdk5i54RXRdhT0C4HDxzSD^%^2FXykF0blI7glzbOpc3u^%^2BstYjiVn07phBotmKq93z6oZQzuV^%^0AyOvakKan841jNUM1TJaVYnlYM^%^2FJFPaE1MnTa0tqQqlMB7W4tpy6N7LXrYared8fpg^%^2FcaPojKk5vW^%^0A^%^2Faf1Yq^%^2FBmNCUMVGGNOXMGER98dJiLyFt6Qle212F6c1PbLAiHJZtVmg5sTV31aH41fA^%^2B9vYTKzeo^%^0APuQ2YrUDTomyMgQz1FlhniP1BOEUye6jCPsQbj6my59Zj6^%^2FU7t446mOyw2NfUNkZCFvrdd^%^2FR7p9^%^2B^%^0Ard66X733RfXBvyufrXSKK0luZj1SviTME5c^%^2BrBQ1h^%^2FSpixMAtC9E5nUAU0JMSyY21Zg^%^2Fllw9v1f7^%^0A2^%^2FfHGyqx^%^2BRejiDDlxRmq3uXEmKJLMdRGx8RvQyVzmHZWChOP6krTLJtqfyx7reratcqttd3r3x5v^%^0AwAS^%^2BOWGAQOTm9xAWxssTBhombRnz^%^2Bgayg0oC4YnLgFUcOY9Eido4^%^2FiyFJ51TkI6PCC^%^2B7EACl9Fxh^%^0AIT4J^%^2BLYg4H6jHifHRYvDCABk2sTOOqSjofnJyLkYLezr8BZ1EOkfcfHspBK9hcVBpSRLbbGMTq4O^%^0AkihLLlNncaM4szgw1Ct0gIPNyzLIAi^%^2B4jD5lyQj1xuQ4ltoyiX2CdZCx2^%^2B0wHe2RexOzWSi2fxrR^%^0AXQbfXvt^%^2F2msEAQDdFktE1bCC1KUZAtrnb^%^2FTnmN7KHBNGVERnpeeEeD6fmpzyQ0c8easjJEPI824u^%^0AFD4VmSpML^%^2Frh8F5FQJJ45BoOv7woJUYLyznUngnMVv20tv81WHfY2vuYnOxg6OraNev6Pys3P^%^2Fdq^%^0AMLTcPAoSichthjR7fnR2fmBRQEL7wmJ8iH2IW4MY85RPl1CS^%^2FnjvQMGIIb59QXw^%^2BxD7ELUGMEKB8^%^0AuvTEsFCYWYpkxDYOHn2IfYhbghhjtmmMM8QhZa7PHB6VBdJJNvGrgZFNswJOOLt0cFZd^%^2B5dHwQWC^%^0Aw6JAAimULitbo^%^2BXcxNQSFPhOItfvfv3u98AmDVAQXFwSqGcyFl0aNJHUSYawxMJbWcQYZFKTIFM4^%^0AKDJSTjbElbtrOz99Vft286RBTJDkNq0Jh5Z7VDCC2zid5tsQvg3x2zYEgQC7zf^%^2FJc1CJlIfjhPjb^%^0AYxwDhE^%^2Fo9hgE0leXcIbY1JAKy5GLCHWSEVEX3V4so4Rsa4KwJTNsBdSJpnhn^%^2B0rt^%^2BVrtxiOvUixi^%^0AB4qZR80lVlBCkZF0cRCJnbSKgrBFCkx92yZudWmO5^%^2BCtPXhYudtklY8nyMXAYc9dgTLpDC4Zi2TJ^%^0AspCX27jt8us4IghkS25YbAHP2GUERzgin3CCq1^%^2FcqqzcrWx^%^2B6VGIkRPEkPLpDHG4L1UYiaJZCNu3^%^0AZ2wL3jTMdFZaXLrmPXbXru2u^%^2FuBRcAXSdJsQSi6G0GUueSBh9qGRmXkEO6n7lezVksTODnEk6hsQ^%^0ADRfanWc7z7^%^2F2rg0hOnS^
%^2FElsq4wyxNjFtakZPArS4sVXTn8wyU5qRNt7sN7OEI8od^%^2Fc0sYa^%^2Bp6y18^%^0A8AvocEy^%^2B^%^2FUNWEiA8^%^2Fec^%^2B^%^2BcN^%^2FAVBLBwg6zwtu3AgAADZsAAA^%^3D^%^0A',
                'CASTGC': 'TGT-8918879-zwRiVbYXCSHisu3Y9vYFFZ1zFV4kS1pldjhTjmVfIaoB0M6BBq-my.wanfangdata.com.cn',
                'Hm_lpvt_838fbc4154ad87515435bf1e10023fab': '1609746447',
            }

            headers = {
                'Proxy-Connection': 'keep-alive',
                'Pragma': 'no-cache',
                'Cache-Control': 'no-cache',
                'X-User-Agent': 'grpc-web-javascript/0.1',
                'X-Grpc-Web': '1',
                'Content-Type': 'application/grpc-web+proto',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
                'Accept': '*/*',
                'Origin': 'http://s.wanfangdata.com.cn',
                'Referer': 'http://s.wanfangdata.com.cn/thesis?q=^%^E4^%^B8^%^AD^%^E5^%^9B^%^BE^%^E5^%^88^%^86^%^E7^%^B1^%^BB^%^E5^%^8F^%^B7^%^3AR-01',
                'Accept-Language': 'zh-CN,zh;q=0.9',
            }
            bytes_body = search_request.SerializeToString()
            # gRPC-web frame header: 1 flag byte followed by the 4-byte big-endian message length
            bytes_head = bytes([0]) + len(bytes_body).to_bytes(4, 'big')
            data = bytes_head + bytes_body
            response = requests.post('http://s.wanfangdata.com.cn/SearchService.SearchService/search', headers=headers, cookies=cookies, data=data)

            res = response.text
            # print(res)
            pattern = r"Ch[A-Za-z0-9%]+"

            # [conference] conference papers: IDs start with ChZDb
            # [periodical] journal articles:  IDs start with ChlQZ
            # [thesis]     degree theses:     IDs start with ChJUa
            # [patent]     patents:           IDs start with ChJQY
            # [cstad]      S&T achievements:  IDs start with ChFDc

            res1 = re.compile(pattern).findall(res)
            m = 0
            for i in res1:
                if len(i) > 20:
                    # print('Pass 1: results from page {}'.format(page))
                    m += 1
                    # print(m)
                    # print(i)
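                    # Each detail endpoint accepts a JSON body of the form {"Id": <record id>}; the id is also carried in meta so the callback can rebuild the public URL.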
                    data = {
                        'Id': i
                    }
                    data = json.dumps(data)
                    if 'ChZDb' in i:
                        yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Conference/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Conference_html)
                    elif 'ChlQZ' in i:
                        yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Periodical/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Periodical_html)
                    elif 'ChJUa' in i:
                        yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Thesis/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Thesis_html)
                    elif 'ChJQY' in i:
                        yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Patent/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Patent_html)
                    elif 'ChFDc' in i:
                        yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Cstad/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Cstad_html)
                    

            while res1 != []:
                page += 1
                search_request.commonrequest.currentPage = page
                bytes_body = search_request.SerializeToString()
                # gRPC-web frame header: 1 flag byte followed by the 4-byte big-endian message length
                bytes_head = bytes([0]) + len(bytes_body).to_bytes(4, 'big')
                data = bytes_head + bytes_body
                response = requests.post('http://s.wanfangdata.com.cn/SearchService.SearchService/search', headers=headers, cookies=cookies, data=data)
                res = response.text
                pattern = r"Ch[A-Za-z0-9%]+"

                res1 = re.compile(pattern).findall(res)
                for i in res1:
                    if len(i) > 20:
                        # print('Pass 2: results from page {}'.format(page))
                        m += 1
                        # print(m)
                        # print(i)
                        data = {
                            'Id': i
                        }
                        data = json.dumps(data)
                        if 'ChZDb' in i:
                            yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Conference/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Conference_html)
                        elif 'ChlQZ' in i:
                            yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Periodical/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Periodical_html)
                        elif 'ChJUa' in i:
                            yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Thesis/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Thesis_html)
                        elif 'ChJQY' in i:
                            yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Patent/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Patent_html)
                        elif 'ChFDc' in i:
                            yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Cstad/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Cstad_html)

    def double_requests(self,response):
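        # Mirrors start_requests, but the search keyword is an author/affiliation string passed in via response.meta['key'] (see the commented-out yields in Periodical_html).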
        print('----- Second pass: detail requests driven by a follow-up author/affiliation search -----')
        key = response.meta['key']
        print(key)
        page = 1
        search_request = pb.SearchService.SearchRequest()
        search_request.commonrequest.searchType = "paper"
        search_request.commonrequest.searchWord = key
        search_request.commonrequest.currentPage = page
        search_request.commonrequest.pageSize = 50
        search_request.commonrequest.searchFilter.append(0)
        search_request.interfaceType = 1


        cookies = {
            'wfpub_token': '26982e2d-03b6-4326-a612-260e93fbe56a',
            'rememberpwd_aabbccqwe': 'QiwpuOQe8O7wtHfxwoUN^%^2BfXGJpfbOhqdCyFqrOhAzE4zqSFmpS0S87TDWAZdW8l60ZOrMerys9xI^%^0A22Xb4pVjXCJJqB^%^2BRhogO',
            'zh_choose': 'n',
            'firstvisit_backurl': 'http^%^3A//www.wanfangdata.com.cn',
            'Hm_lvt_838fbc4154ad87515435bf1e10023fab': '1609314875,1609377029,1609403937,1609722037',
            'token': 'TGT-8577286-L0bG0ecg6MRkafGuCpfubZsPo3JAMt3INq5JFfDovcRidS3kGJ-my.wanfangdata.com.cn',
            'sync_login_wfpub_token': '0179f7fd-7a9d-4b6a-a125-755370240c6c',
            'noReadCollection': '0',
            'WFKS.Auth': '^%^7B^%^22Context^%^22^%^3A^%^7B^%^22AccountIds^%^22^%^3A^%^5B^%^22Group.hnszyxy^%^22^%^2C^%^22GTimeLimit.hnszyxy^%^22^%^5D^%^2C^%^22Data^%^22^%^3A^%^5B^%^7B^%^22Key^%^22^%^3A^%^22Group.hnszyxy.DisplayName^%^22^%^2C^%^22Value^%^22^%^3A^%^22^%^E6^%^B2^%^B3^%^E5^%^8D^%^97^%^E4^%^B8^%^AD^%^E5^%^8C^%^BB^%^E8^%^8D^%^AF^%^E5^%^A4^%^A7^%^E5^%^AD^%^A6^%^22^%^7D^%^5D^%^2C^%^22SessionId^%^22^%^3A^%^22ce94c89a-f806-487c-aeeb-9d0b270c1bd8^%^22^%^2C^%^22Sign^%^22^%^3A^%^22qKfj844t^%^5C^%^2FWqgLJmLugRk1t5kl7iGuK5^%^5C^%^2F^%^5C^%^2FMn6m4mGwR2d74bDih4^%^5C^%^2F^%^2B1AK1AO0uhJ2^%^22^%^7D^%^2C^%^22LastUpdate^%^22^%^3A^%^222021-01-04T07^%^3A44^%^3A50Z^%^22^%^2C^%^22TicketSign^%^22^%^3A^%^22HsitxZoK82RjKW6xdUMZSQ^%^3D^%^3D^%^22^%^2C^%^22UserIp^%^22^%^3Anull^%^7D',
            'SEARCHHISTORY_0': 'UEsDBBQACAgIAJl9JFIAAAAAAAAAAAAAAAABAAAAMO2d^%^2FW^%^2FbxhnH^%^2F5VCAIMWSxPyeEfeGTAKUS^%^2BO^%^0AZfklfreX^%^2FUBLtCVbbyYpK3JRwNualzqLOy^%^2FNmjSOG7RZjWBJ0KzB6rRJ^%^2B8fMlKz^%^2FYneUEr^%^2BJbJQt^%^0AikwTkKUj73iS7^%^2Fvh6bnnnjv9^%^2FuOAqc5ktAE1qwW6csVM5nQgnQx0Babmx8oYgiJGYuB0oGhoem^%^2Fy^%^0AZQFDU^%^2FVEarRcoJcINFPP0AtSplnoOnvWOFNSc7Nqbi6pmuqZRD57JpE7W1ALmv7RYjcXgZyCuWCY^%^0AiyCOKJwSYQmMOSxxEZlTBE5R7DNRTpE5MTj8IQ^%^2Fou2sXC7pmGOl8jr7PzvYj684v1tXL1Sc^%^2FW5^%^2F^%^2F^%^0A2NUoY9qfJlDZ^%^2BNq6uvqflT9aj77beXGdJnaef7X7^%^2BDFLbN^%^2Bwrj6gierWemV1pbL6D2t9de^%^2Fw6l8r^%^0Amxv0sHLvinXlMkv8cHN361NW1cb3lb8^%^2Fs379kqZ3ty7XvllnJ^%^2B^%^2F8svPsO^%^2FrGuWI2mi^%^2FmaOOIvCSd^%^0ADiR0TTW10TRrT0HiiQyRJAm0GT857dTWsTwZHB5V^%^2B0UitdTWpVKpWWvXL2m8xNOGeSaZ^%^2F2ivnm41^%^0AkzllpPIl^%^2B^%^2BBUQZ3TRtLLNFUvMpHXk91MBKoSIVxE4jBNyxxP04TDIkcElkv1IbZ6Qchy6^%^2BWDtozB^%^0AEHvUpcY8u4rWg6F9ldC4Codsqe2sIDiVNkb19Nycpo^%^2Bqc92HFbce3a5c2rpg1v68bv24ad3fotpW^%^0AHz7c2V6p3f7UerJp^%^2FeXn2u1vOoQBQTyEACIEIUwkQtV1RmBRDEV6lktIEIiPgFcR4Km6zghEwmkd^%^0A5aY0AQAfAY8igCSqrjMCA4n^%^2BqfJwIglEHwHPIiBSdZ0RODcQPV8azoyI2P8i8CwCvEh4ZwTkeSVu^%^0A9oN^%^2BUfYJ8CgBUOZF7NIJ9IRJTETTiEc^%^2BAl5FAFJ1nRGIKhMLC^%^2F3zZSRDHwGvIsBTdZ0RkJSlhfLI^%^0AUhLLvlvAqwiIMlXXGYHcRDEvhpUlAngfAY8iAAhV1xmBopgwwjBUlEQfAY8igIlA1XVGYFyZgblx^%^0AFQvtGxDsV5tqRdWmotmKYdTFAWBrKttn97n0KQwKpLm^%^2FO3qRnU3PoEYWDtereVvUcCD6ppXbCaVe^%^0AeZDDwX2FX1V^%^2BdFrixcbuyqWuCwFra4NNUVx6eiHwXv2kdf0mJY1muTN69mj2zvZT66dVekizLwQ6^%^0AA2KBPzzJ0YBYEnhBdoY4pScGYvL0hIwEvx^%^2FzZj8GMaDqOiPAx2PTOKaVoOx7NryKgISous4IlEEW^%^0A9OHBaeiPaTyKgCxLPHQb0wyP9I5H^%^2BWS^%^2FLGAfAU8iQAEQqbrOCGQjojkaHcDvdqoT8AcheDd6Hwed^%^0AITXuUDOlRYB51xnNTFwn45OCJr5LR7Yv9P8qNKQPwIsu^%^2FurseDIb7ycikluLF3tDpU0tkdqTOqmZ^%^0AajrjLPgx7t73K31AKr65TAKmCrjEGZV7^%^2BsdyE^%^2Bdp8bbJ5CDL^%^2B29Pgg84gABPn2gOQDhMnwhL0ToA^%^0AUsIvz4lBdk5i54RXRdhT0C4HDxzSD^%^2FXykF0blI7glzbOpc3u^%^2BstYjiVn07phBotmKq93z6oZQzuV^%^0AyOvakKan841jNUM1TJaVYnlYM^%^2FJFPaE1MnTa0tqQqlMB7W4tpy6N7LXrYared8fpg^%^2FcaPojKk5vW^%^0A^%^2Faf1Yq^%^2FBmNCUMVGGNOXMGER98dJiLyFt6Qle212F6c1PbLAiHJZtVmg5sTV31aH41fA^%^2B9vYTKzeo^%^0APuQ2YrUDTomyMgQz1FlhniP1BOEUye6jCPsQbj6my59Zj6^%^2FU7t446mOyw2NfUNkZCFvrdd^%^2FR7p9^%^2B^%^0Ard66X733RfXBvyufrXSKK0luZj1SviTME5c^%^2BrBQ1h^%^2FSpixMAtC9E5nUAU0JMSyY21Zg^%^2Fllw9v1f7^%^0A2^%^2FfHGyqx^%^2BRejiDDlxRmq3uXEmKJLMdRGx8RvQyVzmHZWChOP6krTLJtqfyx7reratcqttd3r3x5v^%^0AwAS^%^2BOWGAQOTm9xAWxssTBhombRnz^%^2Bgayg0oC4YnLgFUcOY9Eido4^%^2FiyFJ51TkI6PCC^%^2B7EACl9Fxh^%^0AIT4J^%^2BLYg4H6jHifHRYvDCABk2sTOOqSjofnJyLkYLezr8BZ1EOkfcfHspBK9hcVBpSRLbbGMTq4O^%^0AkihLLlNncaM4szgw1Ct0gIPNyzLIAi^%^2B4jD5lyQj1xuQ4ltoyiX2CdZCx2^%^2B0wHe2RexOzWSi2fxrR^%^0AXQbfXvt^%^2F2msEAQDdFktE1bCC1KUZAtrnb^%^2FTnmN7KHBNGVERnpeeEeD6fmpzyQ0c8easjJEPI824u^%^0AFD4VmSpML^%^2Frh8F5FQJJ45BoOv7woJUYLyznUngnMVv20tv81WHfY2vuYnOxg6OraNev6Pys3P^%^2Fdq^%^0AMLTcPAoSichthjR7fnR2fmBRQEL7wmJ8iH2IW4MY85RPl1CS^%^2FnjvQMGIIb59QXw^%^2BxD7ELUGMEKB8^%^0AuvTEsFCYWYpkxDYOHn2IfYhbghhjtmmMM8QhZa7PHB6VBdJJNvGrgZFNswJOOLt0cFZd^%^2B5dHwQWC^%^0Aw6JAAimULitbo^%^2BXcxNQSFPhOItfvfv3u98AmDVAQXFwSqGcyFl0aNJHUSYawxMJbWcQYZFKTIFM4^%^0AKDJSTjbElbtrOz99Vft286RBTJDkNq0Jh5Z7VDCC2zid5tsQvg3x2zYEgQC7zf^%^2FJc1CJlIfjhPjb^%^0AYxwDhE^%^2Fo9hgE0leXcIbY1JAKy5GLCHWSEVEX3V4so4Rsa4KwJTNsBdSJpnhn^%^2B0rt^%^2BVrtxiOvUixi^%^0AB4qZR80lVlBCkZF0cRCJnbSKgrBFCkx92yZudWmO5^%^2BCtPXhYudtklY8nyMXAYc9dgTLpDC4Zi2TJ^%^0AspCX27jt8us4IghkS25YbAHP2GUERzgin3CCq1^%^2FcqqzcrWx^%^2B6VGIkRPEkPLpDHG4L1UYiaJZCNu3^%^0AZ2wL3jTMdFZaXLrmPXbXru2u^%^2FuBRcAXSdJsQSi6G0GUueSBh9qGRmXkEO6n7lezVksTODnEk6hsQ^%^0ADRfanWc7z7^%^2F2rg0hOnS^%^2F
Elsq4wyxNjFtakZPArS4sVXTn8wyU5qRNt7sN7OEI8od^%^2Fc0sYa^%^2Bp6y18^%^0A8AvocEy^%^2B^%^2FUNWEiA8^%^2Fec^%^2B^%^2BcN^%^2FAVBLBwg6zwtu3AgAADZsAAA^%^3D^%^0A',
            'CASTGC': 'TGT-8918879-zwRiVbYXCSHisu3Y9vYFFZ1zFV4kS1pldjhTjmVfIaoB0M6BBq-my.wanfangdata.com.cn',
            'Hm_lpvt_838fbc4154ad87515435bf1e10023fab': '1609746447',
        }

        headers = {
            'Proxy-Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'X-User-Agent': 'grpc-web-javascript/0.1',
            'X-Grpc-Web': '1',
            'Content-Type': 'application/grpc-web+proto',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'Accept': '*/*',
            'Origin': 'http://s.wanfangdata.com.cn',
            'Referer': 'http://s.wanfangdata.com.cn/thesis?q=^%^E4^%^B8^%^AD^%^E5^%^9B^%^BE^%^E5^%^88^%^86^%^E7^%^B1^%^BB^%^E5^%^8F^%^B7^%^3AR-01',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        bytes_body = search_request.SerializeToString()
        # gRPC-web frame header: 1 flag byte followed by the 4-byte big-endian message length
        bytes_head = bytes([0]) + len(bytes_body).to_bytes(4, 'big')
        data = bytes_head + bytes_body
        response = requests.post('http://s.wanfangdata.com.cn/SearchService.SearchService/search', headers=headers, cookies=cookies, data=data)

        res = response.text
        # print(res)
        pattern = r"Ch[A-Za-z0-9%]+"

        # [conference] conference papers: IDs start with ChZDb
        # [periodical] journal articles:  IDs start with ChlQZ
        # [thesis]     degree theses:     IDs start with ChJUa
        # [patent]     patents:           IDs start with ChJQY
        # [cstad]      S&T achievements:  IDs start with ChFDc

        res1 = re.compile(pattern).findall(res)
        m = 0
        for i in res1:
            if len(i) > 20:
                print('Pass 1: results from page {}'.format(page))
                m += 1
                print(m)
                print(i)
                data = {
                    'Id': i
                }
                data = json.dumps(data)
                if 'ChZDb' in i:
                    yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Conference/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Conference_html)
                elif 'ChlQZ' in i:
                    yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Periodical/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Periodical_html)
                elif 'ChJUa' in i:
                    yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Thesis/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Thesis_html)
                elif 'ChJQY' in i:
                    yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Patent/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Patent_html)
                elif 'ChFDc' in i:
                    yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Cstad/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Cstad_html)
                

        while res1 != []:
            page += 1
            search_request.commonrequest.currentPage = page
            bytes_body = search_request.SerializeToString()
            # gRPC-web frame header: 1 flag byte followed by the 4-byte big-endian message length
            bytes_head = bytes([0]) + len(bytes_body).to_bytes(4, 'big')
            data = bytes_head + bytes_body
            response = requests.post('http://s.wanfangdata.com.cn/SearchService.SearchService/search', headers=headers, cookies=cookies, data=data)
            res = response.text
            pattern = r"Ch[A-Za-z0-9%]+"

            res1 = re.compile(pattern).findall(res)
            for i in res1:
                if len(i) > 20:
                    print('Pass 2: results from page {}'.format(page))
                    m += 1
                    print(m)
                    print(i)
                    data = {
                        'Id': i
                    }
                    data = json.dumps(data)
                    if 'ChZDb' in i:
                        yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Conference/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Conference_html)
                    elif 'ChlQZ' in i:
                        yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Periodical/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Periodical_html)
                    elif 'ChJUa' in i:
                        yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Thesis/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Thesis_html)
                    elif 'ChJQY' in i:
                        yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Patent/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Patent_html)
                    elif 'ChFDc' in i:
                        yield scrapy.Request(url='http://d.wanfangdata.com.cn/Detail/Cstad/',method='POST', body=data,meta={'Id':i}, dont_filter=True, callback=self.Cstad_html)

# [Detail-page callbacks start here (Periodical_html and friends)]
    def Periodical_html(self,response):
        meta = response.meta
        print('============= Parsing the periodical detail page =============')
        url = 'http://d.wanfangdata.com.cn/periodical/' + meta['Id']
        # name_1 = response.meta['name']
        html = json.loads(response.text)
        text_html = html['detail'][0]['periodical']
        # print(text_html)
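        # The detail JSON mixes older lowercase field names (e.g. 'authors_name', 'perio_title') with newer CamelCase ones (e.g. 'Creator', 'PeriodicalTitle'), so each field below is read with nested try/except fallbacks.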
        """ 作者 """
        authors = ''
        try:
            authors_names = str(text_html['authors_name'])
        except:
            try:
                authors = str(text_html['Creator'])
            except:
                authors = ''
            else:
                authors = authors.replace('[', '').replace(']', '').replace('\'', '').replace('"','')
        else:
            authors = authors_names.replace('[', '').replace(']', '').replace('\'', '')


        authorunits = ''
        """ Authors and their affiliations """
        # These author/affiliation strings can be fed back into double_requests as the next search keyword (see the commented-out yields below)
        try:
            authorsandunits = text_html['AuthorOrg']
        except:
            try:
                authorsandunits = text_html['authorsandunit']
            except:
                authorunits = ''
            else:
                # for au in authorsandunits:
                #     print(au)
                #     yield scrapy.Request(url='http://d.wanfangdata.com.cn',meta={'key':au},dont_filter=True,callback=self.double_requests)
                authorunits = str(authorsandunits).replace('[', '').replace(']', '').replace('\'', '')
        else:
            # for au in authorsandunits:
            #     print(au)
            #     yield scrapy.Request(url='http://d.wanfangdata.com.cn',meta={'key':au},dont_filter=True,callback=self.double_requests)
            authorunits = str(authorsandunits).replace('[', '').replace(']', '').replace('\'', '')



        """  分类号  """
        classcode4search = ''
        try:
            classcode4search = str(text_html['OriginalClassCode'])
        except:
            try:
                classcode4search = str(text_html['classcode4search'])
            except:
                classcode4search = ''
            else:
                classcode4search = classcode4search.replace('[', '').replace(']', '').replace('\'', '')
        else:
            classcode4search = classcode4search.replace('[', '').replace(']', '').replace('\'', '')

        """ 刊名 """
        PeriodicalTitle= ''
        try:
            PeriodicalTitle = str(text_html['perio_title'])
        except:
            try:
                PeriodicalTitle = str(text_html['PeriodicalTitle'][0])
            except:
                PeriodicalTitle = ''
            

        """ 关键字 """
        keywords = ''
        try:
            # keyword = str(text_html['keywords'])
            keyword = str(text_html['Keywords'])
        except:
            try:
                keywords = str(text_html['MachinedKeywords'])
            except:
                keywords = ''
            else:
                keywords = keywords.replace('[', '').replace(']', '').replace('\'', '').replace('"', '')
        else:
            keywords = keyword.replace('[', '').replace(']', '').replace('\'', '').replace('"', '')


        """ 题目 """
        try:
            title = str(text_html['title'][0]).replace('[', '').replace(']', '').replace('\'', '').replace('<','').replace('>','').replace('/','').replace('em','')
        except:
            title = str(text_html['Title'][0]).replace('[', '').replace(']', '').replace('\'', '').replace('<','').replace('>','').replace('/','').replace('em','')

        """ 目的 """
        summary = ''
        try:
            summary = str(text_html['summary']).replace('[', '').replace(']', '').replace('\'', '').replace('"', '').replace('\r', '').replace('\n', '')
        except:
            try:
                summary = str(text_html['Abstract']).replace('[', '').replace(']', '').replace('\'', '').replace('"', '').replace('\r', '').replace('\n', '')
            except:
                summary = ''

        """ 年卷期 """
        Pvi = ''
        try:
            # PublishYear = str(text_html['publish_year'])
            PublishYear = str(text_html['PublishYear'])
        except:
            print('This record has no publication year')
        else:
            Pvi = PublishYear
            try:
                # Issue = text_html['issue_num'].replace('"', '')
                Issue = text_html['Issue'].replace('"', '')
            except:
                print('This record has no issue number')
            else:
                Pvi = PublishYear + ',' + '(' + Issue + ')'
                try:
                    # Volum = text_html['volume'].replace('"', '')
                    Volum = text_html['Volum'].replace('"', '')
                except:
                    print('This record has no volume number')
                else:
                    Pvi = PublishYear + ',' + Volum + '(' + Issue + ')'

        """ 页数 """
        page_cnt = ''
        try:
            page_cnt = str(text_html['page_cnt'])
        except:
            try:
                page_cnt = str(text_html['PageNo'])
            except:
                page_cnt = ''

        """ 页码 """
        page_range = ''
        try:
            page_range = str(text_html['page_range'])
        except:
            try:
                page_range = str(text_html['Page'])
            except:
                page_range = ''

        print('url={}'.format(url))
        print('authors={}'.format(authors))
        print('author affiliations={}'.format(authorunits))
        print('class code={}'.format(classcode4search))
        print('journal={}'.format(PeriodicalTitle))
        print('keywords={}'.format(keywords))
        print('title={}'.format(title))
        print('abstract={}'.format(summary))
        print('page count={}'.format(page_cnt))
        print('page range={}'.format(page_range))
        print('year/volume/issue={}'.format(Pvi))



        path_0 = os.path.join(r'/baiyu/app/doc-3.0', 'periodical')
        """ Create the periodical folder """
        if not os.path.exists(path_0):
            os.mkdir(path_0)

        path_01 = os.path.join(path_0, '期刊')
        """ Create the journals folder """
        if not os.path.exists(path_01):
            os.mkdir(path_01)

        path_1 = os.path.join(path_01, PeriodicalTitle)
        """ Create the journal-title folder """
        if not os.path.exists(path_1):
            os.mkdir(path_1)

        path_2 = os.path.join(path_1, PublishYear or 'unknown')  # fall back when the year field is missing
        """ Create the year folder """
        if not os.path.exists(path_2):
            os.mkdir(path_2)
        if title:
            """ Write the detail file """
            filename = str(re.compile('[\u4e00-\u9fff]+').findall(title))
            filename = re.sub('[\/:*?"<>|]', '-', filename)
            with open(os.path.join(path_2, filename + '.txt'), 'a', encoding='utf8') as fp:
                fp.write(url + '\n' + authors + '\n' + authorunits + '\n' + classcode4search + '\n' + PeriodicalTitle + '\n' + keywords + '\n' + title + '\n' + summary + '\n' + page_cnt + '\n' + page_range + '\n' + Pvi)

        Periodical_item = PeriodicalItem(
            url = url,
            author = authors,
            unit = authorunits,
            classcode4search = classcode4search,
            periodical = PeriodicalTitle,
            keyword = keywords,
            title = title,
            abstract = summary,
            page = page_cnt,
            pagenumber = page_range,
            year_issue_volume = Pvi,
        )
        yield Periodical_item

 
  
    def Patent_html(self,response):
        url = 'http://d.wanfangdata.com.cn/patent/' + response.meta['Id']
        # name_1 = response.meta['name']
        html = json.loads(response.text)
        text_html = html['detail'][0]['patent']
        # print(text_html)
        """ 专利名称 """
        title = str(text_html['Title']).replace('[','').replace(']','').replace('\'','')
        """ 摘要 """
        abstract = str(text_html['Abstract']).replace('[','').replace(']','').replace('\'','').replace('\r','').replace('\n','')
        """ 专利类型 """
        PatentType = text_html['PatentType']
        """ 申请/专利号 """
        PatentCode = text_html['PatentCode']
        """ 申请日期 """
        ApplicationDate = text_html['ApplicationDate']
        """ 公开/公告号 """
        PublicationNo = text_html['PublicationNo']
        """ 公开/公告日 """
        PublicationDate = text_html['PublicationDate']
        """ 申请/专利权人 """
        Applicant = str(text_html['Applicant']).replace('[','').replace(']','').replace('\'','')
        """ 发明/设计人 """
        Inventor = str(text_html['Inventor']).replace('[','').replace(']','').replace('\'','')
        """ 主权项 """
        SignoryItem = text_html['SignoryItem']

        print(url)
        print(title)
        print(abstract)
        print(PatentType)
        print(PatentCode)
        print(ApplicationDate)
        print(PublicationNo)
        print(PublicationDate)
        print(Applicant)
        print(Inventor)
        print(SignoryItem)



        path_0 = os.path.join(r'/baiyu/app/doc-2.0', 'patent')
        """ Create the patent folder """
        if not os.path.exists(path_0):
            os.mkdir(path_0)

        path_1 = os.path.join(path_0, '专利')
        """ Create the patents folder """
        if not os.path.exists(path_1):
            os.mkdir(path_1)

        ti = str(ApplicationDate)
        path_2 = os.path.join(path_1, ti)
        """ Create the date folder """
        if not os.path.exists(path_2):
            os.mkdir(path_2)
        if title:
            """ Write the detail file """
            filename = str(re.compile('[\u4e00-\u9fff]+').findall(title))
            filename = re.sub('[\/:*?"<>|]', '-', filename)
            filename = filename.replace('[','').replace(']','').replace('\'','')
            with open(os.path.join(path_2, filename + '.txt'), 'a', encoding='utf8') as fp:
                fp.write(url + '\n' + title + '\n' + abstract + '\n' + str(PatentType) + '\n' + str(PatentCode) + '\n' + str(ApplicationDate) + '\n' + str(PublicationNo) + '\n' + str(PublicationDate) + '\n' + Applicant + '\n' + Inventor + '\n' + str(SignoryItem))




        Patent_item = PatentItem(
            url = url,
            title = title,
            abstract = abstract,
            patentType = PatentType,
            patentCode = PatentCode,
            applicationDate = ApplicationDate,
            publicationNo = PublicationNo,
            publicationDate = PublicationDate,
            applicant = Applicant,
            inventor = Inventor,
            signoryItem = SignoryItem
        )
        yield Patent_item




    def Cstad_html(self,response):
        meta = response.meta
        new_html = json.loads(response.text)
        html_data = new_html['detail'][0]['Cstad']
        url = 'http://d.wanfangdata.com.cn/cstad/' + meta['Id']
    
        # Title
        title =  html_data['Title'][0]
        
        # Project registration number
        Cstad_id =  str(html_data['Id'])
        
        # Achievement type
        achievementType =  html_data['AchievementType']
    
        # Year the achievement was published
        publishYear =  str(html_data['PublishYear'])

        # Completing organization(s)
        organizations =  str(html_data['Organization']).replace('[','').replace(']','').replace('\"','').replace('\'','')
        
        # Application industry name
        tradeName =  str(html_data['TradeName']).replace('[','').replace(']','').replace('\"','').replace('\'','')

        
        # Application industry code
        tradeCode =  str(html_data['TradeCode']).replace('[','').replace(']','').replace('\"','').replace('\'','')
        
        # Contact organization name
        contactUnit =  html_data['ContactUnit']
        
        # Contact organization address
        contactAddress =  html_data['ContactAddress']

        # Abstract
        abstract = str(html_data['Abstract']).replace('[','').replace(']','').replace('\"','').replace('\'','')
    
        # Keywords
        keyword = str(html_data['Keywords']).replace('[','').replace(']','').replace('\"','').replace('\'','')

        # Authors
        author = str(html_data['Creator']).replace('[','').replace(']','').replace('\"','').replace('\'','')

        print(url)
        print(title)
        print(Cstad_id)
        print(achievementType)
        print(publishYear)
        print(organizations)
        print(tradeName)
        print(tradeCode)
        print(contactUnit)
        print(contactAddress)
        print(abstract)
        print(keyword)
        print(author)

        path_0 = os.path.join(r'/baiyu/app/doc-2.0','cstad')
        """ 创建cstad文件夹 """
        if not os.path.exists(path_0):
            os.mkdir(path_0)
        
        path_1 = os.path.join(path_0,'成果')
        """ 创建成果文件夹 """
        if not os.path.exists(path_1):
            os.mkdir(path_1)

        path_2 = os.path.join(path_1,publishYear)
        """ 创建时间文件夹 """
        if not os.path.exists(path_2):
            os.mkdir(path_2)
        if title:
            """ 创建明细期刊文件 """
            filename = str(re.compile('[\u4e00-\u9fff]+').findall(title))
            filename = re.sub('[\/:*?"<>|]','-',filename)
            with open(os.path.join(path_2, filename + '.txt'), 'a', encoding='utf8') as fp:
                fp.write(url + '\n' +title + '\n' +Cstad_id + '\n' +achievementType + '\n' +publishYear + '\n' +organizations + '\n' +tradeName + '\n' +tradeCode + '\n' +contactUnit + '\n' + contactAddress+ '\n' + abstract+ '\n' + keyword+ '\n' + author)


        Cstad_item = CstadItem(
            url = url,
            title = title,
            Cstad_id = Cstad_id,
            achievementType = achievementType,
            publishYear = publishYear,
            organizations = organizations,
            tradeName = tradeName,
            tradeCode = tradeCode,
            contactUnit = contactUnit,
            contactAddress = contactAddress,
            abstract = abstract,
            keyword = keyword,
            author = author,
        )

        yield Cstad_item
       
    def Conference_html(self,response):
        url = 'http://d.wanfangdata.com.cn/conference/' + response.meta['Id']
        html = json.loads(response.text)
        text_html = html['detail'][0]['conference']

        """ 学位论文题目 """
        title = text_html['Title'][0]

        """ 摘要 """
        abstract = str(text_html['Abstract']).replace('[','').replace(']','').replace('\'','').replace('\r','').replace('\n','').replace('\\u3000\\u3000','')

        """ 作者 """
        author = str(text_html['Creator']).replace('[','').replace(']','').replace('\'','')

        """ 机标分类号 """
        classcode = str(text_html['MachinedClassCode']).replace('[','').replace(']','').replace('\'','')
        if classcode == '':
            classcode = str(text_html['ClassCode']).replace('[','').replace(']','').replace('\'','')

        """ 论文关键字 """
        keywords = str(text_html['Keywords']).replace('[','').replace(']','').replace('\'','')
        if keywords == '':
            keywords = str(text_html['MachinedKeywords']).replace('[','').replace(']','').replace('\'','')
        
        """ 作者及单位 """
        authororg = str(text_html['AuthorOrg']).replace('[','').replace(']','').replace('\'','')

        """ 会议时间 """
        publisdate = str(text_html['MeetingDate'])
        """ 会议地点 """
        publishyear = str(text_html['MeetingArea'])

        """ 页码 """
        page = str(text_html['Page'])



        print(url)
        print(title)
        print(abstract)
        print(author)
        print(classcode)
        print(keywords)
        print(authororg)
        print(publisdate)
        print(publishyear)
        print(page)




        path_0 = os.path.join(r'/baiyu/app/doc-2.0','conference')
        """ 创建conference文件夹 """
        if not os.path.exists(path_0):
            os.mkdir(path_0)
        
        path_1 = os.path.join(path_0,'会议')
        """ 创建期刊文件夹 """
        if not os.path.exists(path_1):
            os.mkdir(path_1)   

        ti = str(publisdate)
        path_2 = os.path.join(path_1,ti)
        """ 创建时间文件夹 """
        if not os.path.exists(path_2):
            os.mkdir(path_2)
        if title:
            """ 创建明细期刊文件 """
            filename = str(re.compile('[\u4e00-\u9fff]+').findall(title))
            filename = re.sub('[\/:*?"<>|]','-',filename)
            with open(os.path.join(path_2, filename + '.txt'), 'a', encoding='utf8') as fp:
                fp.write(url + '\n' +title + '\n' +abstract + '\n' +author + '\n' +classcode + '\n' +keywords + '\n' +authororg + '\n' +publisdate + '\n' +publishyear + '\n' +page)




        Conference_item = ConferenceItem(
            url = url,
            title = title,
            abstract = abstract,
            author = author,
            classcode = classcode,
            keywords = keywords,
            authororg = authororg,
            publisdate = publisdate,
            publishyear = publishyear,
            page = page,
        )
        yield Conference_item
       
    def Thesis_html(self,response):
        url = 'http://d.wanfangdata.com.cn/thesis/' + response.meta['Id']
        # name_1 = response.meta['name']
        html = json.loads(response.text)
        text_html = html['detail'][0]['thesis']
        # print(text_html)
        """ 学位论文题目 """
        title = text_html['Title'][0]

        """ 论文简介 """
        abstract = str(text_html['Abstract']).replace('[','').replace(']','').replace('\'','').replace('\r','').replace('\n','')

        """ 论文作者 """
        author = str(text_html['Creator']).replace('[','').replace(']','').replace('\'','')

        """ 论文分类号 """
        classcode = str(text_html['MachinedClassCode']).replace('[','').replace(']','').replace('\'','')
        if classcode == '':
            classcode = str(text_html['ClassCode']).replace('[','').replace(']','').replace('\'','')

        """ 论文关键字 """

        keywords = str(text_html['Keywords']).replace('[','').replace(']','').replace('\'','')
        if keywords == '':
            keywords = str(text_html['MachinedKeywords']).replace('[','').replace(']','').replace('\'','')
        
        """ 作者及单位 """
        authororg = str(text_html['AuthorOrg']).replace('[','').replace(']','').replace('\'','')

        """ 学科专业 """
        major = text_html['Major']

        """ 导师姓名 """
        tutor = str(text_html['Tutor']).replace('[','').replace(']','').replace('\'','')

        """ 学位年度 """
        publishyear = str(text_html['PublishYear'])

        """ 授予学位 """
        degree = text_html['Degree']


        print(url)
        print(title)
        print(abstract)
        print(author)
        print(classcode)
        print(keywords)
        print(authororg)
        print(major)
        print(tutor)
        print(publishyear)
        print(degree)



        path_0 = os.path.join(r'/baiyu/app/doc-2.0','thesis')
        """ 创建thesis文件夹 """
        if not os.path.exists(path_0):
            os.mkdir(path_0)
        
        path_1 = os.path.join(path_0,'学位')
        """ 创建期刊文件夹 """
        if not os.path.exists(path_1):
            os.mkdir(path_1)

        ti = str(publishyear)
        path_2 = os.path.join(path_1,ti)
        """ 创建时间文件夹 """
        if not os.path.exists(path_2):
            os.mkdir(path_2)
        if title:
            """ 创建明细期刊文件 """
            filename = str(re.compile('[\u4e00-\u9fff]+').findall(title))
            filename = re.sub('[\/:*?"<>|]','-',filename)
            filename = filename.replace('[','').replace(']','').replace('\'','')
            with open(os.path.join(path_2, filename + '.txt'), 'a', encoding='utf8') as fp:
                fp.write(url + '\n' +title + '\n' +abstract + '\n' +author + '\n' +classcode + '\n' +keywords + '\n' +authororg + '\n' +major + '\n' +tutor + '\n' +publishyear + '\n' +degree)




        Thesis_item = ThesisItem(
            url = url,
            title = title,
            abstract = abstract,
            author = author,
            classcode = classcode,
            keywords = keywords,
            authororg = authororg,
            major = major,
            tutor = tutor,
            publishyear = publishyear,
            degree = degree
        )
        yield Thesis_item
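
Assuming the usual Scrapy project layout (an items.py defining PeriodicalItem, PatentItem, CstadItem, ConferenceItem and ThesisItem, plus the generated test_pb2 module next to the spiders package), the crawl can be started from the project root, for example:

    scrapy crawl doctor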