Python WeChat crawler example

Single-threaded version:

import urllib.request
import urllib.parse
import urllib.error
import re, time

headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

list_url = []


### Fetch the content behind a URL (the proxy handler is left commented out)
def use_proxy(url):
    try:
        # proxy = urllib.request.ProxyHandler({'http': proxy_addr})    ## proxy-enabled variant
        # opener = urllib.request.build_opener(proxy)
        # urllib.request.install_opener(opener)
        headers = ("User-Agent",
                   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(url).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception: " + str(e))
        time.sleep(1)


## Collect the article URLs to crawl from the Sogou WeChat search results
def get_url(key, pagestart, pageend):
    try:
        keycode = urllib.parse.quote(key)

        for page in range(pagestart, pageend + 1):
            # type=2 limits the search to articles; page selects the result page
            # (the original had type=%d and a hard-coded page=1, so every loop
            # iteration fetched the same page)
            url = "http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%d&ie=utf8" % (
                keycode, page)
            data1 = use_proxy(url)
            listurl_pattern = '<h3>.*?("http://.*?)</h3>'
            result = re.compile(listurl_pattern, re.S).findall(data1)
            for i in range(len(result)):
                res = result[i].replace("amp;", "").split(" ")[0].replace("\"", "")
                list_url.append(res)
        return list_url
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception:", e)


## Crawl each collected URL and write title and body into one HTML file
def get_url_content(list_url):
    fh1 = open("D:\\python-script\\1.html", 'wb')
    html1 = '''<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<title>微信文章</title></head>\n<body>'''
    fh1.write(html1.encode("utf-8"))
    fh1.close()
    fh = open("D:\\python-script\\1.html", 'ab')
    for url in list_url:
        data_content = use_proxy(url)
        title_pattern = '<h2.*>.*?</h2>'
        result_title = re.compile(title_pattern, re.S).findall(data_content)
        ## title (str)
        res_title = result_title[0].replace("<h2 class=\"rich_media_title\" id=\"activity-name\">", "").replace("</h2>", "").strip()

        content_pattern = 'id="js_content">(.*?)<div class="rich_media_tool" id="js_sg_bar">'
        content = re.compile(content_pattern, re.S).findall(data_content)

        try:
            fh.write(res_title.encode("utf-8"))
            for i in content:
                fh.write(i.strip().encode("utf-8"))
        except UnicodeEncodeError:
            continue

    fh.write("</body></html>".encode("utf-8"))
    fh.close()


if __name__ == '__main__':
    pagestart = 1
    pageend = 2
    key = "人工智能"  # search keyword: "artificial intelligence"
    get_url(key, pagestart, pageend)
    get_url_content(list_url)
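
In use_proxy the proxy handler is only hinted at in comments. A minimal sketch of the proxy-enabled variant those comments point to, assuming proxy_addr is a hypothetical "host:port" string (not a value from this post):

# Hypothetical proxy-enabled fetch; proxy_addr (e.g. "127.0.0.1:8888") is an
# assumed placeholder.
def use_proxy_addr(url, proxy_addr):
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy)
    opener.addheaders = [("User-Agent",
                          "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")]
    urllib.request.install_opener(opener)
    return urllib.request.urlopen(url).read().decode('utf-8')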
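To see what the URL cleanup in get_url does, here is one matched fragment run through the same string operations (the sample HTML attribute layout is an assumption about Sogou's markup, inferred from the pattern the code searches for):

# Assumed sample of what listurl_pattern captures from a result page.
matched = '"http://mp.weixin.qq.com/s?src=3&amp;timestamp=1514344260" uigs="article_title_0"'
res = matched.replace("amp;", "").split(" ")[0].replace("\"", "")
print(res)  # -> http://mp.weixin.qq.com/s?src=3&timestamp=1514344260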

Multi-threaded version:

import urllib.request
import urllib.parse
import urllib.error
import re, time
import queue
import threading

headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

urlque = queue.Queue()

### Fetch the content behind a URL (the proxy handler is left commented out)
def use_proxy(url):
    try:
        # proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        # opener = urllib.request.build_opener(proxy)
        # urllib.request.install_opener(opener)
        headers = ("User-Agent",
                   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(url).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception: " + str(e))
        time.sleep(1)



### Producer thread: collect the article URLs and put them on the queue
class get_url(threading.Thread):
    def __init__(self, key, pagestart, pageend, urlque):
        threading.Thread.__init__(self)
        self.pagestart = pagestart
        self.pageend = pageend
        self.key = key
        self.urlque = urlque

    def run(self):
        try:
            keycode = urllib.parse.quote(self.key)

            for page in range(self.pagestart, self.pageend + 1):
                # type=2 limits the search to articles; page selects the result page
                url = "http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%d&ie=utf8" % (keycode, page)
                data = use_proxy(url)
                listurl_pattern = '<h3>.*?("http://.*?)</h3>'
                result = re.compile(listurl_pattern, re.S).findall(data)
                print(result)
                if len(result) == 0:
                    print("no usable urls")
                    return
                for i in range(len(result)):
                    res = result[i].replace("amp;", "").split(" ")[0].replace("\"", "")
                    self.urlque.put(res)    ## enqueue; task_done() is the consumer's job, not the producer's
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            elif hasattr(e, "reason"):
                print(e.reason)
        except Exception as e:
            print("exception:", e)

## Consumer thread: pull URLs off the queue and write the article content to HTML
class get_url_content(threading.Thread):
    def __init__(self, urlque):
        threading.Thread.__init__(self)
        self.urlque = urlque

    def run(self):
        fh1 = open("D:\\python-script\\1.html", 'wb')
        html1 = '''<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<title>微信文章</title></head>\n<body>'''
        fh1.write(html1.encode("utf-8"))
        fh1.close()
        fh = open("D:\\python-script\\1.html", 'ab')
        while True:
            try:
                # treat 10 seconds of an empty queue as "crawl finished"
                url = self.urlque.get(timeout=10)
            except queue.Empty:
                break
            try:
                data_content = use_proxy(url)

                title_pattern = '<h2.*>.*?</h2>'
                result_title = re.compile(title_pattern, re.S).findall(data_content)
                ## title
                res_title = result_title[0].replace("<h2 class=\"rich_media_title\" id=\"activity-name\">", "").replace("</h2>", "").strip()

                content_pattern = 'id="js_content">(.*?)<div class="rich_media_tool" id="js_sg_bar">'
                content = re.compile(content_pattern, re.S).findall(data_content)

                fh.write(res_title.encode("utf-8"))
                for i in content:
                    fh.write(i.strip().encode("utf-8"))
            except UnicodeEncodeError:
                continue
            finally:
                self.urlque.task_done()
        fh.write("</body></html>".encode("utf-8"))
        fh.close()    ## close only after the queue has drained (the original closed it inside the loop)
## Watchdog thread: report progress and stop once the queue has drained
## (the original put this loop in __init__, which blocked at construction)
class control(threading.Thread):
    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while True:
            print("program is running")
            if self.urlqueue.empty():
                time.sleep(3)
                print("program finished")
                break
            time.sleep(3)



if __name__ == '__main__':
    pagestart = 1
    pageend = 2
    key = "人工智能"  # search keyword: "artificial intelligence"

    producer = get_url(key, pagestart, pageend, urlque)  # renamed so the class is not shadowed
    producer.start()

    consumer = get_url_content(urlque)
    consumer.start()

    watchdog = control(urlque)
    watchdog.start()
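
The polling control thread above is one way to decide when to stop. A leaner alternative, sketched here as an assumption rather than as part of the original post, is to rely on queue.Queue.join(), which blocks until every put() has been matched by a task_done() call from the consumer:

# Sketch: coordination via Queue.join() instead of a polling watchdog.
# Reuses the producer/consumer classes defined above.
producer = get_url(key, pagestart, pageend, urlque)
consumer = get_url_content(urlque)
producer.start()
consumer.start()
producer.join()   # all URLs have been queued
urlque.join()     # the consumer has called task_done() for every URL
consumer.join()   # the consumer exits via its queue.Empty timeout and closes the file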

  
