bug-bug-bug
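
A small multi-threaded qiushibaike spider: one group of threads pulls page numbers from a queue and downloads the hot-list pages, a second group parses the HTML with lxml and appends each entry (author and text) to Phrase.json. It is Python 2 code (print statements, the Queue module), built on requests and threading.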

# -*- coding: utf-8 -*-
import json
import threading
import requests
from lxml import etree
from time import sleep
from Queue import Queue, Empty

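# Producer threads: each one pulls page numbers from the shared page queue,
# downloads the corresponding hot page and puts the raw HTML onto the global data_que.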
class Get_Html_Pthread(threading.Thread):
    def __init__(self, threadid, que):
        threading.Thread.__init__(self)
        self.threadid = threadid
        self.que = que

    def run(self):
        self.gethtml()

    def gethtml(self):
        while True:
            try:
                # non-blocking get: several threads share this queue, so a blocking
                # get() after an empty() check could hang forever on the last page
                page = self.que.get(False)
            except Empty:
                break
            print 'qiushibaike spider No.' + str(self.threadid) + ' page = ' + str(page)
            url = 'https://www.qiushibaike.com/hot/page/' + str(page) + '/'
            print url
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'}
            num_try = 4
            success = False
            while num_try > 0:
                num_try -= 1
                try:
                    # give the request a timeout so a stalled connection cannot block the thread
                    content = requests.get(url, headers=headers, timeout=10)
                    data_que.put(content.text)
                    success = True
                    break
                except Exception as e:
                    print 'qiushi_spider', e
            if not success:
                print 'failed after retries: ' + url


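# Consumer threads: each one pulls raw HTML from data_que, extracts author/phrase
# pairs with lxml and appends them to the shared output file under a lock.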
class Get_Message_Pthread(threading.Thread):
    def __init__(self, threadid, que, lock, f):
        threading.Thread.__init__(self)
        self.threadid = threadid
        self.lock = lock
        self.que = que
        self.f = f

    def run(self):
        # keep pulling downloaded pages until the main thread signals that no more are coming
        while not exitFlag_Parser:
            try:
                html = self.que.get(False)
            except Empty:
                sleep(0.1)
                continue
            if html:
                self.getmessage(html)
            self.que.task_done()

    def getmessage(self, html1):
        global total
        try:
            html = etree.HTML(html1)
            # every entry on the page sits in a div whose id starts with "qiushi_tag"
            result = html.xpath('//div[contains(@id,"qiushi_tag")]')
            for each in result:
                comment_res = each.xpath('.//span')[0].text
                name = each.xpath('.//h2')[0].text
                resultq = {
                    'author': name,
                    'phrase': comment_res,
                }
                print resultq
                with self.lock:
                    self.f.write(json.dumps(resultq, ensure_ascii=False).encode('utf-8') + "\n")

        except Exception as e:
            print 'parser_data', e

        with self.lock:
            total += 1

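# Shared state: the HTML queue filled by the downloaders, the lock guarding the
# output file and the counter, the consumers' stop flag and the parsed-page counter.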
data_que = Queue()
lock = threading.Lock()
exitFlag_Parser = False
total = 0
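
# main: queue up pages 1-10, start 5 download threads and 5 parser threads, wait for
# both queues to drain, then stop the parsers and close the output file.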
def main():
    output = open('Phrase.json', 'a')
    pageque = Queue(60)
    for page in range(1,11):
        pageque.put(page)
    gethtmlpthread = []
    for threadid in range(5):
        thread = Get_Html_Pthread(threadid, pageque)
        thread.start()
        gethtmlpthread.append(thread)

    getmessagepthread = []


    for threadid in range(5):
        thread = Get_Message_Pthread(threadid, data_que, lock, output)
        thread.start()
        getmessagepthread.append(thread)

    # wait until every page number has been handed out to a download thread
    while not pageque.empty():
        sleep(1)

    for t in gethtmlpthread:
        t.join()

    # wait until the parser threads have drained the HTML queue
    while not data_que.empty():
        sleep(1)

    # every page has been fetched and parsed: signal the parser threads to stop,
    # then wait for them before closing the output file
    global exitFlag_Parser
    exitFlag_Parser = True
    for t in getmessagepthread:
        t.join()

    output.close()

if __name__ == '__main__':
    main()
    print 'total pages parsed = ' + str(total)

 

posted on 2017-10-13 20:11  Elaine