百度图片爬虫-python版
#coding:utf-8
"""
Created on 2015-9-17

@author: huangxie
"""
12
13 import time,math,os,re,urllib,urllib2,cookielib
14
15 from bs4 import BeautifulSoup
16
17 import time
18
19 import re
20
21 import uuid
22
23 import json
24
25 from threading import Thread
26
27 from Queue import Queue
28
29 import MySQLdb as mdb
30
31 import sys
32
33 import threading
34
35 import utils
36
37 import imitate_browser
38
39 from MySQLdb.constants.REFRESH import STATUS
40
# Python 2-only workaround: sys.setdefaultencoding is removed from the sys
# module at interpreter start-up, so the module is reloaded to get it back.
# UTF-8 then becomes the codec for implicit str <-> unicode conversions,
# which the crawler relies on when it mixes byte strings and unicode text.
reload(sys)
sys.setdefaultencoding('utf-8')
44
45
46
# Connection settings for the local MySQL server.
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

# Initial HTTP proxy. BaiduImage.get_pic() rebinds this module-level name
# when it rotates to another proxy.
proxy = {u'http': u'222.39.64.13:8118'}

# Baidu image-search JSON endpoint. The crawler passes the keyword as `word`,
# multiples of 24 as `pn` and 24 as `rn` (presumably result offset and page
# size -- confirm against the Baidu API).
TOP_URL = "http://image.baidu.com/i?tn=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn}"

# Baidu web-search page for a keyword; its related-search box is the source
# of new seed keywords.
KEYWORD_URL = "https://www.baidu.com/s?ie=utf-8&f=8&tn=baidu&wd={wd}"

# A fuller header set used previously, kept for reference:
#   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
#   'Accept': 'json;q=0.9,*/*;q=0.8'
#   'Accept-Charset': 'utf-8;q=0.7,*;q=0.3'
#   'Accept-Encoding': 'gzip'
#   'Connection': 'close'
#   'Referer': None   (if fetching still fails, try the host of the target site)

# Request headers sent with every urllib2 request.
i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
80
81
82
def GetDateString():
    """Return today's local date as 'YYYY-M-D' (month and day are not zero-padded)."""
    today = time.localtime()
    return "%d-%d-%d" % (today.tm_year, today.tm_mon, today.tm_mday)
90
91
92
class BaiduImage(threading.Thread):
    """Crawl Baidu image search and archive the images it lists.

    Work items are rows of the ``info`` table (a keyword plus a result
    offset).  Each worker started by start_work() loops over get_pic(), which
    downloads the images of one queued result page, and prepare_request(),
    which queues the next pending row.  Image metadata is written to
    ``pic_info``; replacement proxies are read from the ``proxy`` table.

    Every worker shares the single MySQL connection and cursor created in
    __init__, so database access is serialised through ``self.mutex``.

    NOTE(review): this class was recovered from a paste that had lost its
    indentation; the statement nesting was rebuilt from the control flow and
    should be checked against the author's original where that is available.
    """

    # Number of results requested per page (the ``rn`` URL parameter).
    _PAGE_SIZE = 24
    # Only the first 1/_SAMPLE_DIVISOR of a keyword's results are scheduled.
    _SAMPLE_DIVISOR = 100

    def __init__(self):
        """Open the browser helper and the MySQL connection and reset all counters."""
        threading.Thread.__init__(self)
        self.browser = imitate_browser.BrowserBase()
        self.chance = 0      # failed result-page requests since the last reset
        self.chance1 = 0     # image downloads that raised IOError since the last reset
        self.request_queue = Queue()    # (info id, keyword, page offset) tuples
        self.wait_ana_queue = Queue()   # not used by the live code in this class
        self.count = 0       # images saved so far; spreads files over sub-folders
        # Re-entrant, so a thread that already holds the lock may take it again
        # (prepare_request -> generateSeed, parse_json -> anaylis_info).
        self.mutex = threading.RLock()
        self.commit_count = 0   # pic_info inserts since the last commit
        self.ID = 500           # ID of the next row to read from the proxy table
        self.next_proxy_set = set()
        self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'sosogif', charset='utf8')
        self.dbconn.autocommit(False)
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')

    def work(self, item):
        """Worker loop: alternately process one queued page and queue the next one.

        ``item`` is only the worker's index, used in the start-up message.
        """
        print("start thread %s" % (item,))
        while True:
            self.get_pic()
            self.prepare_request()

    def format_keyword_url(self, keyword):
        """Return the UTF-8 encoded Baidu web-search URL for ``keyword``."""
        return KEYWORD_URL.format(wd=keyword).encode('utf-8')

    def generateSeed(self, url):
        """Store the related searches shown on page ``url`` as new seed keywords.

        Only related keywords containing "动态图" or "gif" that are not yet in
        ``info`` are inserted.  This is best effort: any failure is printed
        and otherwise ignored so that the calling worker keeps running.
        """
        try:
            html = self.browser.openurl(url).read()
            if not html:
                return
            soup = BeautifulSoup(html)
            # Related searches are looked up in <div id="rs"><table>; a page
            # without that box simply has nothing to harvest.
            related = soup.find('div', id='rs')
            table = related.find('table') if related is not None else None
            if table is None:
                return
            for tr in table.find_all('tr'):
                for th in tr.find_all('th'):
                    links = th.find_all('a')
                    if not links:
                        continue
                    keyword = links[0].text.strip()
                    if "动态图" in keyword or "gif" in keyword:
                        print("keyword %s" % (keyword,))
                        self.dbcurr.execute('select id from info where word=%s', (keyword,))
                        if not self.dbcurr.fetchone():
                            self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,0,0,0,0)', (keyword,))
            self.dbconn.commit()
        except Exception as err:
            print("generateSeed error %s" % (err,))

    def prepare_request(self):
        """Queue the next unvisited ``info`` row and mark it as visited.

        A seed row (page_num, left_num and how_many all 0) is first expanded
        into related keywords and one row per result page.  The lock is
        released even when a database call raises, so one failing worker
        cannot block the others.
        """
        with self.mutex:
            self.dbcurr.execute('select * from info where status=0')
            result = self.dbcurr.fetchone()
            if not result:
                return
            row_id, word, status, page_num, left_num, how_many = result
            self.request_queue.put((row_id, word, page_num))
            if page_num == 0 and left_num == 0 and how_many == 0:
                self._expand_seed(word, page_num)
            self.dbcurr.execute('update info SET status=1 WHERE id=%s', (row_id,))  # mark as visited
            self.dbconn.commit()

    def _expand_seed(self, word, page_num):
        """Harvest related keywords for ``word`` and insert one ``info`` row per page.

        The caller must hold ``self.mutex``.
        """
        self.generateSeed(self.format_keyword_url(word))
        html = ""
        try:
            url = self.format_top_url(word, page_num, self._PAGE_SIZE)
            html = self.browser.openurl(url).read()
        except Exception as err:
            print("err %s" % (err,))
        if html == "":
            return
        how_many = self.how_many(html)
        print("how_many %s" % (how_many,))
        if how_many is None:
            how_many = 0
        # Only the first 1/100 of the results is wanted.  The divisor is
        # written out explicitly because "how_many/24*100" multiplied the
        # page count by 100 instead of dividing it.
        num = int(math.ceil(how_many / float(self._PAGE_SIZE * self._SAMPLE_DIVISOR)))
        for i in xrange(0, num - 1):
            self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,%s,%s,%s,%s)', (word, 0, i * self._PAGE_SIZE, num - i, how_many))

    def start_work(self, req_max):
        """Start ``req_max`` daemon threads, each running work()."""
        for item in xrange(req_max):
            worker = threading.Thread(target=self.work, args=(item,))
            worker.setDaemon(True)
            worker.start()

    def lock(self):
        """Acquire the shared re-entrant lock."""
        self.mutex.acquire()

    def unlock(self):
        """Release the shared re-entrant lock."""
        self.mutex.release()

    def get_para(self, url, key):
        """Return the raw value of query parameter ``key`` in ``url``.

        Returns None when the parameter is absent or has no ``=value`` part.
        """
        query = url.split('?')[-1]
        for pair in query.split('&'):
            parts = pair.split('=')
            if parts[0] == key:
                return parts[1] if len(parts) > 1 else None
        return None

    def makeDateFolder(self, par, child):
        """Ensure ``par//<today>//child`` exists and return that path.

        When ``par`` is not an existing directory nothing is created and
        ``par`` itself is returned.
        """
        if not os.path.isdir(par):
            return par
        path = par + '//' + GetDateString()
        newFolderName = path + '//' + child
        for folder in (path, newFolderName):
            if not os.path.isdir(folder):
                os.mkdir(folder)
        return newFolderName

    def parse_json(self, data):
        """Download and record every image listed in a Baidu JSON result page.

        Raises whatever json.loads raises for unparseable ``data``; the
        caller counts that as a failed page.  An IOError while handling one
        image (network or file) increments ``self.chance1`` and the loop
        moves on; any other error is printed and ends the loop.
        """
        ipdata = json.loads(data)
        try:
            images = ipdata['imgs']
            if not images:
                return
            for n in images:
                if not n['objURL']:
                    continue
                try:
                    # Re-install the opener for the current module-level
                    # proxy, which another worker may have rotated meanwhile.
                    opener = urllib2.build_opener(urllib2.ProxyHandler(proxy))
                    urllib2.install_opener(opener)
                    # "with" releases the lock on every path.  Previously a
                    # failed download skipped unlock() and left the lock held
                    # by this thread, blocking all other workers.
                    with self.mutex:
                        self._store_image(n)
                except IOError:
                    self.chance1 += 1
        except Exception as parse_error:
            print("parse_error %s" % (parse_error,))

    def _store_image(self, n):
        """Download image entry ``n``, save it to disk and record it in ``pic_info``.

        The caller must hold ``self.mutex``; note that the lock is therefore
        held for the duration of the download, as in the original code.
        Entries whose objURL is already in ``pic_info`` are skipped.
        """
        self.dbcurr.execute('select ID from pic_info where objURL=%s', (n['objURL'],))
        if self.dbcurr.fetchone():
            print("database exist")
            return
        real_extension = utils.get_extension(n['objURL'])
        req = urllib2.Request(n['objURL'], headers=i_headers)
        resp = urllib2.urlopen(req, None, 5)
        try:
            dataimg = resp.read()
        finally:
            resp.close()
        name = str(uuid.uuid1())
        # Anything longer than a dot plus three characters is treated as a GIF.
        if len(real_extension) > 4:
            real_extension = ".gif"
        real_extension = real_extension.lower()
        # GIFs are spread over 60 "d" folders, everything else over 20 "o" folders.
        if real_extension == ".gif":
            child = "d" + str(self.count % 60)
        else:
            child = "o" + str(self.count % 20)
        filename = self.makeDateFolder("E://sosogif", child) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension
        self.count += 1
        try:
            if os.path.exists(filename):
                print("file exist")
                return
            with open(filename, 'wb') as image_file:
                image_file.write(dataimg)
            self.anaylis_info(n, filename, real_extension)  # store the metadata
        except IOError as e1:
            print("e1= %s" % (e1,))

    def title_dealwith(self, title):
        """Return ``title`` without <strong>/</strong> markup, stripped of outer whitespace.

        Titles that contain no markup are returned unchanged apart from the
        strip; the previous slicing on find() == -1 dropped characters.
        """
        return title.replace("<strong>", "").replace("</strong>", "").strip()

    def anaylis_info(self, n, filename, real_extension):
        """Insert the metadata of a saved image into ``pic_info``.

        ``n`` is one entry of the result page's ``imgs`` list.  Nothing is
        inserted when a row with the same ``cs`` value already exists.
        Inserts are committed in batches of ten.  The caller must hold
        ``self.mutex``.
        """
        print("success.")
        objURL = n['objURL']              # image address
        fromURLHost = n['fromURLHost']    # source site
        width = n['width']
        height = n['height']
        di = n['di']                      # used as a unique identifier
        img_type = n['type']              # image format
        keyword = self.title_dealwith(n['fromPageTitle'])
        cs = n['cs']                      # meaning unknown
        os_field = n['os']                # meaning unknown
        acTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # crawl time
        self.dbcurr.execute('select ID from pic_info where cs=%s', (cs,))
        if self.dbcurr.fetchone():
            return
        print("add pic %s" % (filename,))
        self.commit_count += 1
        self.dbcurr.execute('INSERT INTO pic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', (objURL, fromURLHost, width, height, di, img_type, keyword, cs, os_field, acTime, filename, real_extension))
        if self.commit_count == 10:
            self.dbconn.commit()
            self.commit_count = 0

    def format_top_url(self, word, pn, rn):
        """Return the UTF-8 encoded image-search URL for ``word`` with the given ``pn`` and ``rn``."""
        return TOP_URL.format(word=word, pn=pn, rn=rn).encode('utf-8')

    def how_many(self, data):
        """Return ``displayNum`` from a JSON result page.

        Returns the value as an int when it is positive, 0 when it is not,
        and None when ``data`` cannot be parsed or lacks the field.
        """
        try:
            display_num = json.loads(data)['displayNum']
            if display_num > 0:
                return int(display_num)
            return 0
        except Exception:
            # Best effort: callers treat None as "count unknown".
            return None

    def get_pic(self):
        """Fetch one queued result page and download the images it lists.

        Returns at once when the queue is empty.  A failed page request
        increments ``self.chance``; once ``chance`` or ``chance1`` show
        failures, the next proxy candidate replaces the module-level
        ``proxy``.  Every error is printed and swallowed so that the worker
        loop keeps running.
        """
        global proxy
        try:
            print("size of queue %s" % (self.request_queue.qsize(),))
            if self.request_queue.qsize() == 0:
                return
            row_id, word, page_num = self.request_queue.get()
            page_url = self.format_top_url(word, page_num, self._PAGE_SIZE)
            # The lock is released even if a database call raises.
            with self.mutex:
                self._mark_visited_and_load_proxy(row_id)
            opener = urllib2.build_opener(urllib2.ProxyHandler(proxy))
            urllib2.install_opener(opener)
            try:
                req = urllib2.Request(page_url, headers=i_headers)
                response = urllib2.urlopen(req, None, 5)
                html = response.read()
                if html:
                    self.parse_json(html)
            except Exception:
                self.chance += 1
            if self.chance > 0 or self.chance1 > 1:
                if self.next_proxy_set:
                    protocol, address = self.next_proxy_set.pop()
                    proxy = {protocol: address}
                    print("change proxy finished<< %s %s" % (proxy, self.ID))
        except Exception as e:
            print("error1 %s" % (e,))

    def _mark_visited_and_load_proxy(self, row_id):
        """Mark ``info`` row ``row_id`` as visited and, after failures, load a proxy candidate.

        The caller must hold ``self.mutex``.
        """
        self.dbcurr.execute('update info SET status=1 WHERE id=%s', (row_id,))
        self.dbconn.commit()
        # Any recorded failure triggers loading the next proxy candidate.
        if not (self.chance > 0 or self.chance1 > 1):
            return
        if self.ID % 100 == 0:
            # Every 100 ids, wrap around once the table has been exhausted.
            self.dbcurr.execute("select count(*) from proxy")
            total = self.dbcurr.fetchone()[0]
            if self.ID > total:
                self.ID = 50
        self.dbcurr.execute("select * from proxy where ID=%s", (self.ID,))
        for row in self.dbcurr.fetchall():
            # Columns 1-3 are read as protocol, ip and port (schema not
            # visible here).  Formatting with %s also works if the port is
            # stored as a number.
            self.next_proxy_set.add((row[1], "%s:%s" % (row[2], row[3])))
        self.chance = 0
        self.chance1 = 0
        self.ID += 1
748
749
750
if __name__ == '__main__':
    app = BaiduImage()
    app.start_work(80)
    # The workers are daemon threads, so the main thread has to stay alive.
    # Sleeping instead of spinning keeps the wait from using a whole CPU core.
    while True:
        time.sleep(1)
2
"""
Created on 2015-9-17

@author: huangxie
"""
12
13 import time,math,os,re,urllib,urllib2,cookielib
14
15 from bs4 import BeautifulSoup
16
17 import time
18
19 import re
20
21 import uuid
22
23 import json
24
25 from threading import Thread
26
27 from Queue import Queue
28
29 import MySQLdb as mdb
30
31 import sys
32
33 import threading
34
35 import utils
36
37 import imitate_browser
38
39 from MySQLdb.constants.REFRESH import STATUS
40
# Python 2-only workaround: sys.setdefaultencoding is removed from the sys
# module at interpreter start-up, so the module is reloaded to get it back.
# UTF-8 then becomes the codec for implicit str <-> unicode conversions,
# which the crawler relies on when it mixes byte strings and unicode text.
reload(sys)
sys.setdefaultencoding('utf-8')
44
45
46
# Connection settings for the local MySQL server.
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

# Initial HTTP proxy. BaiduImage.get_pic() rebinds this module-level name
# when it rotates to another proxy.
proxy = {u'http': u'222.39.64.13:8118'}

# Baidu image-search JSON endpoint. The crawler passes the keyword as `word`,
# multiples of 24 as `pn` and 24 as `rn` (presumably result offset and page
# size -- confirm against the Baidu API).
TOP_URL = "http://image.baidu.com/i?tn=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn}"

# Baidu web-search page for a keyword; its related-search box is the source
# of new seed keywords.
KEYWORD_URL = "https://www.baidu.com/s?ie=utf-8&f=8&tn=baidu&wd={wd}"

# A fuller header set used previously, kept for reference:
#   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
#   'Accept': 'json;q=0.9,*/*;q=0.8'
#   'Accept-Charset': 'utf-8;q=0.7,*;q=0.3'
#   'Accept-Encoding': 'gzip'
#   'Connection': 'close'
#   'Referer': None   (if fetching still fails, try the host of the target site)

# Request headers sent with every urllib2 request.
i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
80
81
82
def GetDateString():
    """Return today's local date as 'YYYY-M-D' (month and day are not zero-padded)."""
    today = time.localtime()
    return "%d-%d-%d" % (today.tm_year, today.tm_mon, today.tm_mday)
90
91
92
class BaiduImage(threading.Thread):
    """Crawl Baidu image search and archive the images it lists.

    Work items are rows of the ``info`` table (a keyword plus a result
    offset).  Each worker started by start_work() loops over get_pic(), which
    downloads the images of one queued result page, and prepare_request(),
    which queues the next pending row.  Image metadata is written to
    ``pic_info``; replacement proxies are read from the ``proxy`` table.

    Every worker shares the single MySQL connection and cursor created in
    __init__, so database access is serialised through ``self.mutex``.

    NOTE(review): this class was recovered from a paste that had lost its
    indentation; the statement nesting was rebuilt from the control flow and
    should be checked against the author's original where that is available.
    """

    # Number of results requested per page (the ``rn`` URL parameter).
    _PAGE_SIZE = 24
    # Only the first 1/_SAMPLE_DIVISOR of a keyword's results are scheduled.
    _SAMPLE_DIVISOR = 100

    def __init__(self):
        """Open the browser helper and the MySQL connection and reset all counters."""
        threading.Thread.__init__(self)
        self.browser = imitate_browser.BrowserBase()
        self.chance = 0      # failed result-page requests since the last reset
        self.chance1 = 0     # image downloads that raised IOError since the last reset
        self.request_queue = Queue()    # (info id, keyword, page offset) tuples
        self.wait_ana_queue = Queue()   # not used by the live code in this class
        self.count = 0       # images saved so far; spreads files over sub-folders
        # Re-entrant, so a thread that already holds the lock may take it again
        # (prepare_request -> generateSeed, parse_json -> anaylis_info).
        self.mutex = threading.RLock()
        self.commit_count = 0   # pic_info inserts since the last commit
        self.ID = 500           # ID of the next row to read from the proxy table
        self.next_proxy_set = set()
        self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'sosogif', charset='utf8')
        self.dbconn.autocommit(False)
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')

    def work(self, item):
        """Worker loop: alternately process one queued page and queue the next one.

        ``item`` is only the worker's index, used in the start-up message.
        """
        print("start thread %s" % (item,))
        while True:
            self.get_pic()
            self.prepare_request()

    def format_keyword_url(self, keyword):
        """Return the UTF-8 encoded Baidu web-search URL for ``keyword``."""
        return KEYWORD_URL.format(wd=keyword).encode('utf-8')

    def generateSeed(self, url):
        """Store the related searches shown on page ``url`` as new seed keywords.

        Only related keywords containing "动态图" or "gif" that are not yet in
        ``info`` are inserted.  This is best effort: any failure is printed
        and otherwise ignored so that the calling worker keeps running.
        """
        try:
            html = self.browser.openurl(url).read()
            if not html:
                return
            soup = BeautifulSoup(html)
            # Related searches are looked up in <div id="rs"><table>; a page
            # without that box simply has nothing to harvest.
            related = soup.find('div', id='rs')
            table = related.find('table') if related is not None else None
            if table is None:
                return
            for tr in table.find_all('tr'):
                for th in tr.find_all('th'):
                    links = th.find_all('a')
                    if not links:
                        continue
                    keyword = links[0].text.strip()
                    if "动态图" in keyword or "gif" in keyword:
                        print("keyword %s" % (keyword,))
                        self.dbcurr.execute('select id from info where word=%s', (keyword,))
                        if not self.dbcurr.fetchone():
                            self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,0,0,0,0)', (keyword,))
            self.dbconn.commit()
        except Exception as err:
            print("generateSeed error %s" % (err,))

    def prepare_request(self):
        """Queue the next unvisited ``info`` row and mark it as visited.

        A seed row (page_num, left_num and how_many all 0) is first expanded
        into related keywords and one row per result page.  The lock is
        released even when a database call raises, so one failing worker
        cannot block the others.
        """
        with self.mutex:
            self.dbcurr.execute('select * from info where status=0')
            result = self.dbcurr.fetchone()
            if not result:
                return
            row_id, word, status, page_num, left_num, how_many = result
            self.request_queue.put((row_id, word, page_num))
            if page_num == 0 and left_num == 0 and how_many == 0:
                self._expand_seed(word, page_num)
            self.dbcurr.execute('update info SET status=1 WHERE id=%s', (row_id,))  # mark as visited
            self.dbconn.commit()

    def _expand_seed(self, word, page_num):
        """Harvest related keywords for ``word`` and insert one ``info`` row per page.

        The caller must hold ``self.mutex``.
        """
        self.generateSeed(self.format_keyword_url(word))
        html = ""
        try:
            url = self.format_top_url(word, page_num, self._PAGE_SIZE)
            html = self.browser.openurl(url).read()
        except Exception as err:
            print("err %s" % (err,))
        if html == "":
            return
        how_many = self.how_many(html)
        print("how_many %s" % (how_many,))
        if how_many is None:
            how_many = 0
        # Only the first 1/100 of the results is wanted.  The divisor is
        # written out explicitly because "how_many/24*100" multiplied the
        # page count by 100 instead of dividing it.
        num = int(math.ceil(how_many / float(self._PAGE_SIZE * self._SAMPLE_DIVISOR)))
        for i in xrange(0, num - 1):
            self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,%s,%s,%s,%s)', (word, 0, i * self._PAGE_SIZE, num - i, how_many))

    def start_work(self, req_max):
        """Start ``req_max`` daemon threads, each running work()."""
        for item in xrange(req_max):
            worker = threading.Thread(target=self.work, args=(item,))
            worker.setDaemon(True)
            worker.start()

    def lock(self):
        """Acquire the shared re-entrant lock."""
        self.mutex.acquire()

    def unlock(self):
        """Release the shared re-entrant lock."""
        self.mutex.release()

    def get_para(self, url, key):
        """Return the raw value of query parameter ``key`` in ``url``.

        Returns None when the parameter is absent or has no ``=value`` part.
        """
        query = url.split('?')[-1]
        for pair in query.split('&'):
            parts = pair.split('=')
            if parts[0] == key:
                return parts[1] if len(parts) > 1 else None
        return None

    def makeDateFolder(self, par, child):
        """Ensure ``par//<today>//child`` exists and return that path.

        When ``par`` is not an existing directory nothing is created and
        ``par`` itself is returned.
        """
        if not os.path.isdir(par):
            return par
        path = par + '//' + GetDateString()
        newFolderName = path + '//' + child
        for folder in (path, newFolderName):
            if not os.path.isdir(folder):
                os.mkdir(folder)
        return newFolderName

    def parse_json(self, data):
        """Download and record every image listed in a Baidu JSON result page.

        Raises whatever json.loads raises for unparseable ``data``; the
        caller counts that as a failed page.  An IOError while handling one
        image (network or file) increments ``self.chance1`` and the loop
        moves on; any other error is printed and ends the loop.
        """
        ipdata = json.loads(data)
        try:
            images = ipdata['imgs']
            if not images:
                return
            for n in images:
                if not n['objURL']:
                    continue
                try:
                    # Re-install the opener for the current module-level
                    # proxy, which another worker may have rotated meanwhile.
                    opener = urllib2.build_opener(urllib2.ProxyHandler(proxy))
                    urllib2.install_opener(opener)
                    # "with" releases the lock on every path.  Previously a
                    # failed download skipped unlock() and left the lock held
                    # by this thread, blocking all other workers.
                    with self.mutex:
                        self._store_image(n)
                except IOError:
                    self.chance1 += 1
        except Exception as parse_error:
            print("parse_error %s" % (parse_error,))

    def _store_image(self, n):
        """Download image entry ``n``, save it to disk and record it in ``pic_info``.

        The caller must hold ``self.mutex``; note that the lock is therefore
        held for the duration of the download, as in the original code.
        Entries whose objURL is already in ``pic_info`` are skipped.
        """
        self.dbcurr.execute('select ID from pic_info where objURL=%s', (n['objURL'],))
        if self.dbcurr.fetchone():
            print("database exist")
            return
        real_extension = utils.get_extension(n['objURL'])
        req = urllib2.Request(n['objURL'], headers=i_headers)
        resp = urllib2.urlopen(req, None, 5)
        try:
            dataimg = resp.read()
        finally:
            resp.close()
        name = str(uuid.uuid1())
        # Anything longer than a dot plus three characters is treated as a GIF.
        if len(real_extension) > 4:
            real_extension = ".gif"
        real_extension = real_extension.lower()
        # GIFs are spread over 60 "d" folders, everything else over 20 "o" folders.
        if real_extension == ".gif":
            child = "d" + str(self.count % 60)
        else:
            child = "o" + str(self.count % 20)
        filename = self.makeDateFolder("E://sosogif", child) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension
        self.count += 1
        try:
            if os.path.exists(filename):
                print("file exist")
                return
            with open(filename, 'wb') as image_file:
                image_file.write(dataimg)
            self.anaylis_info(n, filename, real_extension)  # store the metadata
        except IOError as e1:
            print("e1= %s" % (e1,))

    def title_dealwith(self, title):
        """Return ``title`` without <strong>/</strong> markup, stripped of outer whitespace.

        Titles that contain no markup are returned unchanged apart from the
        strip; the previous slicing on find() == -1 dropped characters.
        """
        return title.replace("<strong>", "").replace("</strong>", "").strip()

    def anaylis_info(self, n, filename, real_extension):
        """Insert the metadata of a saved image into ``pic_info``.

        ``n`` is one entry of the result page's ``imgs`` list.  Nothing is
        inserted when a row with the same ``cs`` value already exists.
        Inserts are committed in batches of ten.  The caller must hold
        ``self.mutex``.
        """
        print("success.")
        objURL = n['objURL']              # image address
        fromURLHost = n['fromURLHost']    # source site
        width = n['width']
        height = n['height']
        di = n['di']                      # used as a unique identifier
        img_type = n['type']              # image format
        keyword = self.title_dealwith(n['fromPageTitle'])
        cs = n['cs']                      # meaning unknown
        os_field = n['os']                # meaning unknown
        acTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # crawl time
        self.dbcurr.execute('select ID from pic_info where cs=%s', (cs,))
        if self.dbcurr.fetchone():
            return
        print("add pic %s" % (filename,))
        self.commit_count += 1
        self.dbcurr.execute('INSERT INTO pic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', (objURL, fromURLHost, width, height, di, img_type, keyword, cs, os_field, acTime, filename, real_extension))
        if self.commit_count == 10:
            self.dbconn.commit()
            self.commit_count = 0

    def format_top_url(self, word, pn, rn):
        """Return the UTF-8 encoded image-search URL for ``word`` with the given ``pn`` and ``rn``."""
        return TOP_URL.format(word=word, pn=pn, rn=rn).encode('utf-8')

    def how_many(self, data):
        """Return ``displayNum`` from a JSON result page.

        Returns the value as an int when it is positive, 0 when it is not,
        and None when ``data`` cannot be parsed or lacks the field.
        """
        try:
            display_num = json.loads(data)['displayNum']
            if display_num > 0:
                return int(display_num)
            return 0
        except Exception:
            # Best effort: callers treat None as "count unknown".
            return None

    def get_pic(self):
        """Fetch one queued result page and download the images it lists.

        Returns at once when the queue is empty.  A failed page request
        increments ``self.chance``; once ``chance`` or ``chance1`` show
        failures, the next proxy candidate replaces the module-level
        ``proxy``.  Every error is printed and swallowed so that the worker
        loop keeps running.
        """
        global proxy
        try:
            print("size of queue %s" % (self.request_queue.qsize(),))
            if self.request_queue.qsize() == 0:
                return
            row_id, word, page_num = self.request_queue.get()
            page_url = self.format_top_url(word, page_num, self._PAGE_SIZE)
            # The lock is released even if a database call raises.
            with self.mutex:
                self._mark_visited_and_load_proxy(row_id)
            opener = urllib2.build_opener(urllib2.ProxyHandler(proxy))
            urllib2.install_opener(opener)
            try:
                req = urllib2.Request(page_url, headers=i_headers)
                response = urllib2.urlopen(req, None, 5)
                html = response.read()
                if html:
                    self.parse_json(html)
            except Exception:
                self.chance += 1
            if self.chance > 0 or self.chance1 > 1:
                if self.next_proxy_set:
                    protocol, address = self.next_proxy_set.pop()
                    proxy = {protocol: address}
                    print("change proxy finished<< %s %s" % (proxy, self.ID))
        except Exception as e:
            print("error1 %s" % (e,))

    def _mark_visited_and_load_proxy(self, row_id):
        """Mark ``info`` row ``row_id`` as visited and, after failures, load a proxy candidate.

        The caller must hold ``self.mutex``.
        """
        self.dbcurr.execute('update info SET status=1 WHERE id=%s', (row_id,))
        self.dbconn.commit()
        # Any recorded failure triggers loading the next proxy candidate.
        if not (self.chance > 0 or self.chance1 > 1):
            return
        if self.ID % 100 == 0:
            # Every 100 ids, wrap around once the table has been exhausted.
            self.dbcurr.execute("select count(*) from proxy")
            total = self.dbcurr.fetchone()[0]
            if self.ID > total:
                self.ID = 50
        self.dbcurr.execute("select * from proxy where ID=%s", (self.ID,))
        for row in self.dbcurr.fetchall():
            # Columns 1-3 are read as protocol, ip and port (schema not
            # visible here).  Formatting with %s also works if the port is
            # stored as a number.
            self.next_proxy_set.add((row[1], "%s:%s" % (row[2], row[3])))
        self.chance = 0
        self.chance1 = 0
        self.ID += 1
748
749
750
if __name__ == '__main__':
    app = BaiduImage()
    app.start_work(80)
    # The workers are daemon threads, so the main thread has to stay alive.
    # Sleeping instead of spinning keeps the wait from using a whole CPU core.
    while True:
        time.sleep(1)
若转载请注明出处!若有疑问,请回复交流!