Getting Started with Python - Writing a Crawler to Scrape Images from a Website - Regular Expressions

// Life is too short, I use Python!

// Python really broadened the horizons of this country kid who had only ever used C++!

 

This is just a test; it successfully scraped 1000+ images from a certain website.

Next step is to make some big news, a big project.

 

 

 

# -*- coding: utf-8 -*-
# A simple image crawler (Python 2): fetch pages, pull image URLs out of the
# HTML with regular expressions, and download them into per-page folders.

import urllib
import urllib2
import re
import os

CNT = 0  # running total of downloaded images


def getHtml(url):
    # Request headers that imitate a real browser; some sites reject requests
    # without a User-Agent. (Header dict adapted from pythontab.com.)
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'gzip',
        'Connection': 'close',
        'Referer': None  # if the site still blocks the request, set this to the target site's host
    }
    req_header_2 = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:28.0) Gecko/20100101 Firefox/28.0'
    }

    req_timeout = 5

    #status = urllib.urlopen(url).code
    #if status != 200:
    #    print 'Http Error %s' % status
    #    return False

    req = urllib2.Request(url, None, req_header_2)
    resp = urllib2.urlopen(req, None, req_timeout)
    html = resp.read()
    return html


def getAllUrl(html):
    # Collect every link of the form <a href="..." target=...>
    reg = r'<a href="(.+)" target='
    theurl = re.compile(reg)
    url = re.findall(theurl, html)
    return url


def getNext(html):
    # Extract the id of the next page from a link like <a href='...pai/12345.html'>
    reg = r"<a href='.+pai/(.+).html'"
    nxtre = re.compile(reg)
    nxt = re.findall(nxtre, html)
    return nxt[0]


def getName(html):
    # Use the page title as the name of the download folder
    reg = r'<title>(.+)</title>'
    nare = re.compile(reg)
    name = re.findall(nare, html)
    return name[0]


def getImg(name, html):
    global CNT

    # The site uses several <img> markup variants, so try each pattern in turn
    reg = r'<img src="(.{0,80}\.jpg)" border="0"'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)

    reg = r'src="(.{0,80}\.jpeg)" border'
    imgre = re.compile(reg)
    imglist.extend(re.findall(imgre, html))

    reg = r"<img src='(.{0,80}\.jpg)'"
    imgre = re.compile(reg)
    imglist.extend(re.findall(imgre, html))

    reg = r"<img src='(.{0,80}\.jpeg)'"
    imgre = re.compile(reg)
    imglist.extend(re.findall(imgre, html))

    # One folder per page, named "<title>-[<count>p]"
    local = '.\\%s-[%sp]' % (name, len(imglist))
    if os.path.exists(unicode(local, 'utf-8')):
        return unicode(local, 'utf-8') + u' already exists'

    os.mkdir(unicode(local, 'utf-8'))

    x = 0
    for imgurl in imglist:
        print imgurl
        urllib.urlretrieve(imgurl, unicode(local + '\\%s.jpg' % x, 'utf-8'))
        x += 1
        CNT += 1

    return unicode('%s: got %s photo(s)' % (name, x), 'utf-8')


def getAll(num):
    # Walk backwards through numbered pages, starting from a known page id
    global CNT
    nxt = 164680
    while num > 0:
        url = '---%s.html' % nxt  # page URL pattern redacted by the author
        print nxt
        html = getHtml(url)
        nxt -= 1
        num -= 1
        if html == False:
            print 'Error'
            continue

        print getImg(getName(html), html)

    return 'done! %s photos!' % str(CNT)


def getAll_update(index):
    # Fetch an index page, then download the images from every page it links to
    global CNT
    num = CNT
    urls = getAllUrl(getHtml(index))

    for url in urls:
        html = getHtml('---' + url)  # site prefix redacted by the author
        print getImg(getName(html), html)
    return 'done! %s photos!' % str(CNT - num)


#print getAll(10)
#html = getHtml('---')
#print getNext(html)

x = 3
while x < 50:
    # index URL pattern redacted by the author; it presumably contained a %s for the page number x
    print getAll_update('---' % x)
    x += 1

#print getAll_update('---')

 

Headers: disguise the request as coming from a browser.
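For reference, a minimal sketch of just the header trick, assuming Python 2's urllib2 as in the script above; the example.com URL is only a placeholder:

import urllib2

url = 'http://example.com/page.html'  # placeholder URL, not the real site
# Without a browser-like User-Agent, some sites answer with 403 or an empty page.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:28.0) Gecko/20100101 Firefox/28.0'}
req = urllib2.Request(url, None, headers)    # data=None, then the header dict
html = urllib2.urlopen(req, None, 5).read()  # data=None, 5-second timeout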

Regular expressions: http://www.cnblogs.com/huxi/archive/2010/07/04/1771073.html    // I'm only just learning these too
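The core regex trick in the script is just a capture group plus re.findall: the parentheses keep only the URL part of each match. A tiny self-contained example with made-up HTML:

import re

html = '<img src="http://example.com/1.jpg" border="0">'  # made-up HTML for illustration
# findall returns whatever the () capture group matched, i.e. just the image URL
imglist = re.findall(r'<img src="(.{0,80}\.jpg)" border="0"', html)
print imglist  # ['http://example.com/1.jpg']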

Basically wrote all of this by searching Baidu as I went.

posted @ 2015-08-12 23:37  Helica