# 清水河畔论坛二手帖子爬虫 — UESTC Qingshuihe BBS second-hand post crawler
# -*- coding:utf-8 -*-
"""Crawler for second-hand trade posts on the UESTC Qingshuihe BBS.

Flow: fetch the login page (authentication is carried by the Cookie
header), locate the second-hand forum link on the front page, ask the
user for a topic tag, page through that topic's thread listing, print
each post's title/URL/body text, and store title+URL into MongoDB.

by 元帅 uestc 2018.2.28
"""
import re
from urllib import request

import pymongo
import requests
from bs4 import BeautifulSoup


class QSHSpider(object):
    """Spider for the second-hand board on bbs.uestc.edu.cn."""

    def __init__(self):
        # NOTE(review): authentication actually rides on the Cookie header;
        # 'username'/'password' are not real HTTP headers and are kept only
        # for backward compatibility with the original script. Fill in
        # Cookie (and optionally the other blanks) before running.
        self.headers = {
            'username': '',
            'password': '',
            'Cache-Control': '',
            'Connection': 'keep-alive',
            'Cookie': '',
            'Host': 'bbs.uestc.edu.cn',
            'Referer': 'http://bbs.uestc.edu.cn/member.php?mod=logging&action=login',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }

    def login(self):
        """Fetch the login page; stores the response HTML in self.login_request."""
        request_url = 'http://bbs.uestc.edu.cn/member.php?mod=logging&action=login'
        self.login_request = requests.get(request_url, headers=self.headers).text

    def get_shopurl(self):
        """Extract the second-hand forum URL from the fetched front page.

        Sets self.shopurl from the first link of the F201 drop-down menu.
        """
        bf = BeautifulSoup(self.login_request, 'html.parser')
        shop_a = (bf.find_all('div', id='hd')[0]
                    .find_all('ul', id='mn_F201_menu', class_='p_pop h_pop')[0]
                    .find_all('li')[0]
                    .find_all('a')[0])
        self.shopurl = shop_a['href']
        print('登陆成功!')
        print('\n')
        print('您已进入二手帖子专题:' + self.shopurl)

    def get_tieziurls(self):
        """Ask the user for a topic tag and resolve its listing URL.

        Sets self.tieziurls (the topic listing URL) and self.pagetag
        (the numeric typeid used by get_pages to build page URLs).
        """
        req = request.Request(url=self.shopurl, headers=self.headers, method='POST')
        content = request.urlopen(req).read().decode('utf-8')
        anchors = re.findall(r'<a.*?href=.*?<\/a>', content)
        print('备选主题有:书籍资料;生活用品;交通工具;卡券虚拟;数码硬件;'
              '拼单;物品租借;其他;版务/投诉;已解决;')
        searcher = input('请输入需要查找的主题 ')
        for anchor in anchors:
            if searcher not in str(anchor):
                continue
            anchor = anchor.replace('amp;', '')
            titles = re.findall(r'<a .*?>(.*?)</a>', anchor,
                                re.I | re.S | re.M)
            hrefs = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')",
                               anchor, re.S | re.I | re.M)
            print(titles[0][0:8] + ' ' + hrefs[0])
            self.tieziurls = hrefs[0]
            # The second number embedded in the href is the topic's typeid.
            tags = re.findall(r'\d+\.?\d*', hrefs[0])
            self.pagetag = tags[1]
            break

    def get_tiezi(self):
        """Scrape one listing page (self.new_url): print every post title/URL,
        fetch each post's details, and insert title+URL into MongoDB."""
        req_tiezi = request.Request(url=self.new_url, headers=self.headers,
                                    method='POST')
        content_tiezi = request.urlopen(req_tiezi).read().decode('utf-8')
        urls_tiezi = re.findall(r'<a.*?href=.*?<\/a>', content_tiezi)
        # Connect to MongoDB once per page, not once per post.
        client = pymongo.MongoClient('localhost', 27017)
        qingshuihepan = client['mydb']['qingshuihepan']
        for tiezi in urls_tiezi:
            # Thread-title links carry the "s xst" class in Discuz markup.
            if 'class="s xst"' not in tiezi:
                continue
            tiezi = tiezi.replace('amp;', '')
            tiezi_title = re.findall(r'<a .*?>(.*?)</a>', tiezi,
                                     re.I | re.S | re.M)
            tiezi_urls = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')",
                                    tiezi, re.S | re.I | re.M)
            print('\n\n')
            print('帖子主题:' + tiezi_title[0] + ' ' + '帖子地址:' + tiezi_urls[0])
            self.tiezi_url = tiezi_urls[0]
            # Was infor.get_details(): relied on the module-level global and
            # broke whenever the class was instantiated under another name.
            self.get_details()
            tiezi_data = {'帖子': {'title': tiezi_title[0], 'url': tiezi_urls[0]}}
            qingshuihepan.insert_one(tiezi_data)

    def get_pages(self):
        """Iterate listing pages 1..2 of the chosen topic and scrape each."""
        urls_based = ('http://bbs.uestc.edu.cn/forum.php?mod=forumdisplay'
                      '&fid=61&typeid={}&filter=typeid&typeid={}&page={}')
        for i in range(1, 3):
            self.new_url = urls_based.format(self.pagetag, self.pagetag, i)
            print('\n\n')
            print('第' + str(i) + '页' + ' ' '本页网址:' + self.new_url)
            # Was infor.get_tiezi(): same global-instance bug as above.
            self.get_tiezi()

    def get_details(self):
        """Fetch self.tiezi_url and print the first post body (.t_f cell)."""
        print('本帖详细内容:')
        req_detail = request.Request(url=self.tiezi_url, headers=self.headers,
                                     method='POST')
        content_detail = request.urlopen(req_detail).read()
        bs = BeautifulSoup(content_detail, 'html.parser')
        print(bs.find_all(class_='t_f')[0].text.strip())


if '__main__' == __name__:
    infor = QSHSpider()
    infor.login()
    print('登陆清水河畔ing……')
    infor.get_shopurl()
    infor.get_tieziurls()
    infor.get_pages()