清水河畔论坛二手帖子爬虫

  1 # -*- coding:utf-8 -*-
  2 '''
  3 清水河畔二手帖子+爬取二手交易帖子
  4 '''
  5 import requests
  6 import json
  7 from bs4 import BeautifulSoup
  8 import sys
  9 import urllib
 10 import re
 11 from urllib import request,parse
 12 import pymongo
 13 #by 元帅 uestc 2018.2.28
 14 class QSHSpider(object):
 15     # 模拟登陆清水河畔
 16     def __init__(self):
 17         self.headers = {
 18         'username':'',
 19         'password':'',
 20         'Cache - Control': '',
 21         'Connection': 'keep - alive',
 22         'Cookie':'',
 23         'Host':'bbs.uestc.edu.cn',
 24         'Referer':'http: // bbs.uestc.edu.cn / member.php?mod = logging & action = login',
 25         'Upgrade-Insecure - Requests': '1',
 26         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
 27         }
 28 
 29     # 模拟登陆
 30     def login(self):
 31         request_url = "http://bbs.uestc.edu.cn/member.php?mod=logging&action=login"
 32         request_new = requests.get(request_url, headers=self.headers)
 33         self.login_request = request_new.text
 34 
 35     def get_shopurl(self):
 36         bf = BeautifulSoup(self.login_request,'html.parser')
 37         #print(bf)
 38         shop_a = bf.find_all('div',id = 'hd')[0].find_all('ul',id = 'mn_F201_menu',class_='p_pop h_pop')[0].find_all('li')[0].find_all('a')[0]
 39         self.shopurl = shop_a['href']
 40         print('登陆成功!')
 41         print('\n')
 42         print('您已进入二手帖子专题:' + self.shopurl)
 43 
 44     def get_tieziurls(self):
 45         #request_tiezi = requests.get(self.shopurl,headers = self.headers)
 46         req = request.Request(url=self.shopurl,headers=self.headers, method="POST")
 47         response = request.urlopen(req)
 48         content = response.read()
 49         res = r"<a.*?href=.*?<\/a>"
 50         urls = re.findall(res, content.decode('utf-8'))
 51         print('备选主题有:书籍资料;生活用品;交通工具;卡券虚拟;数码硬件;'
 52               '拼单;物品租借;其他;版务/投诉;已解决;')
 53         searcher = input("请输入需要查找的主题 ")
 54         #获取a标签内内容
 55         # res = r'<a .*?>(.*?)</a>'
 56         # texts = re.findall(res, content.decode('utf-8'))
 57         # for t in texts:
 58         #     print(t)
 59         #获取a标签内超链接
 60         #urls = re.findall(r'<a.*?href=.*?>\r\n(.+?)<span class="xg1 num">(.*?)</span><\/a>',re.S)
 61         #bff = BeautifulSoup(request_tiezi.text,'html.parser')
 62         #print(bff)
 63         #shop_tc1 = bff.find_all('div',id = 'wp',class_ = 'wp')[0]
 64         #.find_all('div',class_ = 'boardnav')[0].find_all('div',id = 'ct',class_ = 'wp cl')[0].find_all('div',class_ = 'mn')
 65         #shop_tc1 = bff.select('.wp')
 66         #print(shop_tc1)
 67         #.find_all('ul',id = 'thread_types',class_ = 'ttp bm cl cttp',style = 'height: auto;')[0]
 68         #.find_all('li').find_all('a')[0]
 69         # shop_tiezi_url = shop_tc1['href']
 70         # print(shop_tiezi_url)
 71         for u in urls:
 72             if searcher in str(u):
 73                 ui = u.replace('amp;','')
 74                 #print(ui)
 75                 urls1 = re.findall(r'<a .*?>(.*?)</a>' , ui, re.I | re.S | re.M)
 76                 urls2 = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')" ,ui,re.S |re.I |re.M)
 77                 print(urls1[0][0:8] + '   ' + urls2[0])
 78                 self.tieziurls = urls2[0]
 79                 tags = re.findall(r"\d+\.?\d*",urls2[0])
 80                 self.pagetag = tags[1]
 81                 #self.pagetag = re.sub("\D", "", urls2[0])
 82                 #print(self.pagetag)
 83                 break
 84     def get_tiezi(self):
 85         #req_tiezi = request.Request(url=self.tieziurls,headers=self.headers, method="POST")
 86         req_tiezi = request.Request(url=self.new_url, headers=self.headers, method="POST")
 87         response_tiezi = request.urlopen(req_tiezi)
 88         content_tiezi = response_tiezi.read()
 89         res_tiezi = r"<a.*?href=.*?<\/a>"
 90         urls_tiezi = re.findall(res_tiezi, content_tiezi.decode('utf-8'))
 91         for tiezi in urls_tiezi:
 92             if 'class="s xst"' in tiezi:
 93                 tiezi = tiezi.replace('amp;', '')
 94                 tiezi_title = re.findall(r'<a .*?>(.*?)</a>', tiezi, re.I | re.S | re.M)
 95                 tiezi_urls = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')" ,tiezi,re.S |re.I |re.M)
 96                 print('\n\n')
 97                 print('帖子主题:' +tiezi_title[0] + '  ' + '帖子地址:' + tiezi_urls[0])
 98                 self.tiezi_url = tiezi_urls[0]
 99                 infor.get_details()
100                 #self.tiezi_data[] = '帖子地址'
101                 #数据库
102                 tiezi_data = {'帖子' : {'title': tiezi_title[0], 'url' : tiezi_urls[0]}}
103                 client = pymongo.MongoClient('localhost',27017)
104                 mydb = client['mydb']
105                 qingshuihepan = mydb['qingshuihepan']
106                 qingshuihepan.insert_one(tiezi_data)
107     def get_pages(self):
108         urls_based = 'http://bbs.uestc.edu.cn/forum.php?mod=forumdisplay&fid=61&typeid={}&filter=typeid&typeid={}&page={}'
109         for i in range(1,3):
110             self.new_url = urls_based.format(self.pagetag,self.pagetag,i)
111             print('\n\n')
112             print(''+ str(i) + '' + '   ' '本页网址:' + self.new_url)
113             infor.get_tiezi()
114     def get_details(self):
115         #print(self.tiezi_url)
116         print('本帖详细内容:')
117         req_detail = request.Request(url=self.tiezi_url, headers=self.headers, method="POST")
118         response_detail = request.urlopen(req_detail)
119         content_detail = response_detail.read()
120         #print(content_detail.decode('utf-8'))
121         bs = BeautifulSoup(content_detail,'html.parser')
122         print(bs.find_all(class_='t_f')[0].text.strip())
123         #urls_detail = re.findall(r'<td class="t_f".*?>(.*?)</td>' , content_detail.decode('utf-8'))
124         # for ud in urls_detail:
125         #     print(ud)
126 
if __name__ == "__main__":
    # Script entry point: create the spider, fetch the login page, then run
    # the remaining steps in order (board link -> topic choice -> page crawl).
    infor = QSHSpider()
    infor.login()
    print('登陆清水河畔ing……')
    for step in (infor.get_shopurl, infor.get_tieziurls, infor.get_pages):
        step()

 

posted @ 2018-04-01 21:33  Edge_of_Eternity  阅读(821)  评论(0编辑  收藏  举报