A Python web crawler
A small multi-threaded crawler: for each school ID it downloads that school's department list from www.xuebang.com.cn, writes the department names to a file, then uses Queue-fed worker threads to collect every teacher and, in a second pass, every teacher's comments into per-school files. The code targets Python 2 (Queue, xrange, print statements).

#!/usr/bin/env python
# coding=utf-8

import json
import os
import sys
import Queue
import threading

import requests
from bs4 import BeautifulSoup

# Python 2 hack so implicit str/unicode conversions default to UTF-8.
reload(sys)
sys.setdefaultencoding("utf8")

# Serializes file writes from the comment worker threads.
write_lock = threading.Lock()


def http_req_get(siteurl):
    """GET a page from www.xuebang.com.cn; returns the response, or None on error."""
    headers = {
        "Host": "www.xuebang.com.cn",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        # Session cookie captured from a browser; it has long expired and should be replaced.
        "Cookie": "__cfduid=da7335f4b0e760976f98697b651fc10041447572288; pgv_pvi=7944819712; deptNumOf11=140; deptNumOf89=60; deptNumOf711=60; commentNumOf11215=1074; deptNumOf1411=56; JSESSIONID=abcqLyMOLKEVDbynTTtev; a2666_pages=1; a2666_times=4; pgv_si=s4040530944; Hm_lvt_8147cdaed425fa804276ea12cd523210=1447572289,1447678990,1447734730; Hm_lpvt_8147cdaed425fa804276ea12cd523210=1447734730; CNZZDATA5928106=cnzz_eid%3D1168227404-1447570407-%26ntime%3D1447729389; Hm_lvt_863e19f68502f1ae0f9af1286bb12475=1447572289,1447678990,1447734730; Hm_lpvt_863e19f68502f1ae0f9af1286bb12475=1447734730; _ga=GA1.3.122575526.1447572289; _gat=1"}
    try:
        return requests.get(url=siteurl, headers=headers)
    except Exception as e:
        print 'request failed:', siteurl, e
        return None


class LinksParser(object):
    def __init__(self, urlobj):
        self.urlobj = urlobj
        self.soup = BeautifulSoup(self.urlobj.text, "html.parser")

    # Create a directory named after the page title (i.e. the university).
    def createDaXueDir(self):
        # Directory of the running script; create the folder if it is missing.
        current_dir = os.path.dirname(sys.argv[0])
        # gb18030 keeps the Chinese folder name readable on a Chinese Windows box.
        real_path = os.path.join(current_dir, self.soup.title.text.encode('gb18030'))
        if not os.path.exists(real_path):
            try:
                os.mkdir(real_path)
            except OSError:
                pass
        return real_path

    # Write the school's department names to a file.
    def xueyuan(self, path):
        try:
            # was: open(real_path + ...), which silently relied on the global
            fh = open(os.path.join(path, 'xueyuan.txt'), 'wb')
            for link in self.soup.find_all('a', {'class': 'yxcologe'}):
                fh.write(link.text.encode('gb18030').strip() + '\n')
            fh.close()
        except Exception:
            pass

    # Collect the teacher-list URL of every department.
    def teacher(self):
        return [a['href'].encode('gb18030')
                for a in self.soup.find_all('a', {'class': 'yxcologe'})]

    # Fill the module-level teacher_lst with every teacher on one department
    # page, and queue each teacher's comment-page URL.
    def teacher_lst(self):
        # The department name sits at the tail of the breadcrumb span.
        yuanxi = str(self.soup.find('span', {'class': 't_dqwz'}))
        yuanxi = yuanxi[-40:].split('»')[1].split('<')[0]
        for li in self.soup.find('span', {'class': 'TJszlist'}).find_all('li'):
            a = li.find('a')
            teacher_lst.append({'department': yuanxi, 'name': a['title']})
            teacher_comment_url.append(a['href'].encode('gb18030'))

    # Collect every comment on one teacher's page; returns them as JSON.
    def comment_teacher(self):
        infos = self.soup.find_all('span', {'class': 'TJR_info'})
        if not infos:
            return 'no comments'
        teacher_name = self.soup.find(color='#0088cc').text
        page_comments = [{'teacher_id': teacher_name,
                          'comment': info.find('p', {'class': 'TJlycon'}).text,
                          'time': info.find('span').string}
                         for info in infos]
        teacher_all_comment.extend(page_comments)
        # Dump only this page's comments: dumping the shared list repeated
        # every earlier teacher into every later file write.
        return json.dumps(page_comments, encoding="UTF-8", ensure_ascii=False)


class myThreads(threading.Thread):
    """Worker that drains a queue of department pages and parses their teachers."""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            try:
                url = self.queue.get_nowait()
            except Queue.Empty:
                break
            urlobj = http_req_get(url)
            if urlobj is None:
                continue
            try:
                LinksParser(urlobj).teacher_lst()
            except Exception:
                continue  # a parse error on one page should not kill the worker


class commentThreads(threading.Thread):
    """Worker that drains a queue of teacher pages and stores their comments."""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            try:
                url = self.queue.get_nowait()
            except Queue.Empty:
                break
            urlobj = http_req_get(url)
            if urlobj is None:
                continue
            try:
                comments = LinksParser(urlobj).comment_teacher()
                if comments == 'no comments':
                    continue
                # Append under a lock: 'wb' used to overwrite the file on every
                # page, and 50 threads must not interleave their writes.
                with write_lock:
                    fh = open(os.path.join(real_path, 'teacher_comment_lst.txt'), 'ab')
                    fh.write(comments.encode('utf-8') + '\n')
                    fh.close()
            except Exception:
                continue


if __name__ == '__main__':
    # School IDs to crawl (could instead be read from sys.argv[1]).
    idlist = [11, 129, 70, 71]
    thread_number = 50
    for school_id in idlist:
        url = 'http://www.xuebang.com.cn/' + str(school_id) + '/deptlist'
        try:
            # Parse the department list page of this school.
            response_obj = LinksParser(http_req_get(url))
            # Directory where this school's files are saved.
            real_path = response_obj.createDaXueDir()
            response_obj.xueyuan(real_path)

            # Teacher-list URL of every department in this school.
            xi_to_teacher = response_obj.teacher()

            # Shared results filled by the worker threads (list.append is
            # atomic under the GIL, so no extra locking is needed here).
            teacher_lst = []
            teacher_comment_url = []

            # First pass: fan out over the department pages.
            queue = Queue.Queue()
            for line in xi_to_teacher:
                queue.put(line)
            threads = [myThreads(queue) for _ in xrange(thread_number)]
            for t in threads:
                t.start()
            for t in threads:
                t.join()

            # Write the collected teacher list to a file as JSON.
            try:
                fh = open(os.path.join(real_path, 'teacher_lst.txt'), 'wb')
                fh.write(json.dumps(teacher_lst, encoding="UTF-8",
                                    ensure_ascii=False).encode('utf-8'))
                fh.close()
            except IOError:
                pass

            # Second pass: fan out over the teacher pages for their comments.
            teacher_all_comment = []
            queu = Queue.Queue()
            for line_url in teacher_comment_url:
                queu.put(line_url)
            comments = [commentThreads(queu) for _ in xrange(thread_number)]
            for t in comments:
                t.start()
            for t in comments:
                t.join()
        except Exception as e:
            print 'failed for school id', school_id, ':', e
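The hand-rolled Queue/Thread bookkeeping is the part that dates the script. For comparison, below is a minimal Python 3 sketch of the same fan-out step using concurrent.futures. The TJszlist selector and the 50-worker fan-out are carried over from the listing, but the sketch assumes the page structure is unchanged on the live site, so treat it as an outline rather than a drop-in replacement.

# Python 3 sketch of the first fan-out pass (assumed equivalent; the
# selectors are copied from the listing above).
import json
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # the listing sends a fuller header set

def fetch(url):
    # Same role as http_req_get above, with an explicit timeout.
    try:
        return requests.get(url, headers=HEADERS, timeout=10)
    except requests.RequestException:
        return None

def teachers_on_page(url):
    # Same role as LinksParser.teacher_lst: one department page -> teacher dicts.
    resp = fetch(url)
    if resp is None:
        return []
    soup = BeautifulSoup(resp.text, 'html.parser')
    block = soup.find('span', {'class': 'TJszlist'})
    if block is None:
        return []
    return [{'name': a['title'], 'url': a['href']}
            for a in (li.find('a') for li in block.find_all('li')) if a]

if __name__ == '__main__':
    dept_urls = []  # fill with the hrefs collected from the deptlist page
    with ThreadPoolExecutor(max_workers=50) as pool:
        teachers = [t for page in pool.map(teachers_on_page, dept_urls) for t in page]
    print(json.dumps(teachers, ensure_ascii=False))

ThreadPoolExecutor.map replaces the manual queue draining and the empty-check race in run(), and because each call returns its own list there are no shared globals left to lock.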