Web Crawler for Pictures

Python 3.6.3

 

myConfig.py

'''
Configuration
'''
# Site to be crawled
homePageUrl = 'http://bbs.fengniao.com'

# Path where the downloaded images are stored
imgFolder = 'C:/L/workspace/FirstPython/src/1809 - PetPhoto/img/'

# Forum (board) number
forumNum = '30'

# First and last list-page numbers to crawl
pageBegin = 1  # first page
pageEnd = 2    # last page

  

index.py

'''
Crawl the pet photos from the "Pet Photography" board of Fengniao (bbs.fengniao.com)

- - - - - - - - - - - - - - - - - - - - -
For local testing, do not use the browser's "Save page as".
Instead, open the page, right-click, choose "View source", then copy and paste the source.
'''
import myConfig
import myList

''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# main
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
if __name__ == '__main__':

    # Loop over all list pages
    for pageNum in range( myConfig.pageBegin, myConfig.pageEnd + 1 ):

        # String inserted into the URL pattern;
        # the first page uses a slightly different URL format than the later pages
        if pageNum == 1:
            s = ''
        else:
            s = '_%s_lastpost' % str( pageNum )

        # List-page URL
        url = 'http://bbs.fengniao.com/forum/forum_%s%s.html' % ( myConfig.forumNum, s )
        # Generated URLs look like:
        # http://bbs.fengniao.com/forum/forum_30.html
        # http://bbs.fengniao.com/forum/forum_30_2_lastpost.html

        # Process each list page
        myList.eachList( url )

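The first list page and the later pages use slightly different URL formats, which is why the loop above builds the inserted string s conditionally. The same logic, pulled into a tiny standalone helper purely for illustration (listUrl is a hypothetical name, not part of the project):

# Illustration only: the list-page URL logic from index.py as a helper function
def listUrl( forumNum, pageNum ):
    s = '' if pageNum == 1 else '_%s_lastpost' % pageNum
    return 'http://bbs.fengniao.com/forum/forum_%s%s.html' % ( forumNum, s )

print( listUrl( '30', 1 ) )  # http://bbs.fengniao.com/forum/forum_30.html
print( listUrl( '30', 2 ) )  # http://bbs.fengniao.com/forum/forum_30_2_lastpost.html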
 

myList.py

'''
myList (the list module)
- - - - - - - - - -
- A "list" is usually a summary page of the data pages, such as an article list or a thread list
- On this page we compile a list of all the data pages, so the next step in the pipeline is myPage (the page module)
- Its upstream step is usually index (the entry script, which mainly does preparation work)
- There should be an entry point that supports debugging with a local file, usually named pageHtml( a, b, htmlCode )
- 2018-0928 Livon
'''
import re
import os
import urllib.request
import myConfig
import myPage

'''
Process one list page
'''
def eachList( listPageUrl ):

    print( listPageUrl )

    # Fetch one list page
    htmlResponse = urllib.request.urlopen( listPageUrl )
    html = htmlResponse.read()
    html = html.decode('utf8')

    # Parse the html code
    listPageHtml( html )

'''
Parse the html code of a list page
'''
def listPageHtml( html ):

    # Thread-list blocks
    arr_post_list_ul = re.findall("<ul class=\"txtList\">((?:.|\n)*?)</ul>", html )

    if len( arr_post_list_ul ) < 1:
        print('No list block found, stopping.')
        return

    # Print every block, for inspection and debugging only
    for i in range( 0, len( arr_post_list_ul )):
        print('arr_post_list_ul : ' + arr_post_list_ul[i] )

    print('Number of list blocks on this page:')
    print( len( arr_post_list_ul ) )

    # Page 1 has 2 blocks, later pages only one; in every case the last block is the one we want
    ul = arr_post_list_ul[ len(arr_post_list_ul) - 1 ]

    # Thread-list items
    arr_post_list_li = re.findall("<li  >((?:.|\n)*?)</li>", ul )

    # Process each thread
    for i in range( 0, len( arr_post_list_li ) ):
        post_list_li( i, arr_post_list_li[i] )


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# Process one thread
Parameters:
    i: index; li: each post sits inside one li element
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
def post_list_li( i, li ):

    # List holding the URLs of all pages of this thread
    postPageList = []

    # Index
    print( str( i + 1 ) + ' - arr_post_list_li : ' )
    print( li )

    # Time
    time = re.findall('<span class="time">(.*?)</span>', li )
    print( ' - time : ' + time[0] )

    # Author
    author = re.findall('<a class="username" href="(.*?)" target="_blank" title="(.*?)">(.*?)</a>', li )
    print( ' - author : ' + author[0][1] )
    print( ' - homepage : ' + author[0][0] )

    # Title
    title = re.findall('<a href="(.*?)" target="_blank" title="(.*?)" class="tit(.*?)" style=\'\'>(.*?)</a>', li )
    print( ' - thread url : ' + myConfig.homePageUrl + title[0][0] )
    print( ' - thread title : ' + title[0][1] )

    # Add the thread URL
    postPageList.append( myConfig.homePageUrl + title[0][0] )

    # Handle sub-pages (one thread may have several pages, i.e. several URLs)
    sub_post = re.findall('<span>\(</span>(.*?)<span>\)</span>', li )

    # Is there more than one sub-page?
    if len(sub_post) > 0:

        # Yes
        postPageList.clear() # clear the list and start appending again (1st time)
        print( ' - sub-page URLs (the first sub-page has the same content as the thread page): ' )

        # Find all sub-page URLs
        arr_url = re.findall('<a href="(.*?)" target="_blank">(.*?)</a>', sub_post[0] )
        for url in arr_url:
            print( url[1] + ' - ' + myConfig.homePageUrl + url[0] )

            # Append each sub-page URL
            postPageList.append( myConfig.homePageUrl + url[0] )

            # Is there a "last page" link? ('最后一页' means "last page")
            if url[1] == '最后一页':

                # Yes: discard the URLs collected so far and rebuild the list
                postPageList.clear() # clear the list and start appending again (2nd time)

                # Total number of pages (the count sits at a fixed position inside the link URL)
                pageCount = int( url[0][16:-5])
                print( ' total pages: ' + str(pageCount) )
                for pageNum in range( 0, pageCount ):
                    print( myConfig.homePageUrl + url[0][:16] + str( pageNum ) + url[0][-5:] )
                    # Append each sub-page URL
                    postPageList.append( myConfig.homePageUrl + url[0][:16] + str( pageNum ) + url[0][-5:] )

    # Thumbnails on the list page (the list page only shows the first 5)
#     regularExpress = 'style="background-image\:url\((.*?)\?imageView2/2/w/400/q/90/ignore-error/1/\)"></a>'
#     arr_pic = re.findall( regularExpress, li )
#     print( ' - thread images on the list page (first 5): ' )
#     for pic in arr_pic:
#         print( ' - ' + pic )

    # Folder in which the images of this thread are stored
    folderName = '%s《%s》- %s' % ( convertTime( time[0] ), title[0][1], author[0][1] )
    # folderName example: 2018-0924-0642《萌萌哒土拨鼠》-美时美摄

    path = myConfig.imgFolder + folderName
    print( 'storage path ====> ' + path )

    # If the folder already exists, the thread has been crawled before;
    # to crawl it again, delete the folder by hand
    if os.path.exists( path ):
        # Folder already exists
        print( 'ERROR : folder already exists, probably crawled before; delete it manually to crawl again.' )
    else:
        # Folder does not exist, create it
        os.makedirs( path )

        # Process all pages of this thread, i.e. fetch the images for this folder
        myPage.postPageList( folderName, postPageList )


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# Convert the time format
from : 2018-09-29 10:15:25
to   : 2018-0929-1015
purpose: produce a valid directory name
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
def convertTime( t ):

    t = '%s-%s-%s' % ( t[:4], t[4:10].replace('-',''), t[11:16].replace( ':', '' ))
    return t


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# main
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
if __name__ == '__main__':
    print( convertTime('2018-09-29 10:15:25'))

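The module docstring asks for an entry point that can be fed a locally saved file for debugging, and listPageHtml plays that role here. A minimal sketch of such a local test, assuming the list page's source has been saved next to the scripts as list_30.html (a hypothetical file name); note that the downstream steps (myPage / myData) will still go to the network:

# Local-file debugging sketch: parse a saved copy of the list page
# instead of calling urllib.request.urlopen() in eachList()
import myList

with open( 'list_30.html', 'r', encoding='utf8' ) as f:
    html = f.read()

myList.listPageHtml( html )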
 

myPage.py

'''
myPage (the page module)
- - - - - - - - - -
- A "page" is the page that holds the actual data: the article page for articles, the thread page for a forum (one thread usually spans several pages)
- This page contains the data we want, so the next step in the pipeline is usually myData (the data module)
- The upstream step is myList (the list module)
- This module normally receives the addresses of N pages and processes them one by one
- There should be an entry point that supports debugging with a local file, usually named pageHtml( a, b, htmlCode )
- The last function usually handles a single data object; here it handles a single image
- 2018-0928 Livon
'''

import re
import urllib.request
import myConfig
import myData


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
    Every post (thread) has at least one postPage (thread page)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
def postPageList( folderName, postPageList ):

    # Process the URL of each page
    for pageNum in range( 0, len( postPageList ) ):

        # Request the thread page; it may be the only page or one of several sub-pages
        htmlResponse = urllib.request.urlopen( postPageList[pageNum] )
        html = htmlResponse.read()
        html = html.decode('utf8')

        # Handle the html code of the response
        postPageHtml( folderName, pageNum, html )


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
    Handle the page html
    Call this method directly when testing with a local html file
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
def postPageHtml( folderName, pageNum, html ):

    # All image URLs on the page
    regularExpression = '<img src="(.*?)\?imageView2/2/w/1024/q/90/ignore-error/1/">'
    arr_picUrl = re.findall( regularExpression, html )

    # Handle each image in turn
    for i in range( 0, len( arr_picUrl )):
        print( 'index: %s - image url: %s' % ( str(i+1), arr_picUrl[i] ))
        eachImg( folderName, pageNum, i, arr_picUrl[i] )


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
    Handle a single image
    Parameters: folder name, Nth page of the thread, Nth image on that page, image URL
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
def eachImg( folderName, pageNum, i, imgUrl ):

    # Full-size image
    file = '%s%s/%s-%s.jpg' % ( myConfig.imgFolder, folderName, str( pageNum), str(i))
    myData.crawl( file, imgUrl ) # download it

    # Thumbnail
    file = '%s%s/%s-%s_small.jpg' % ( myConfig.imgFolder, folderName, str( pageNum), str(i))
    smallImgUrl = imgUrl + '?imageView2/2/w/1024/q/90/ignore-error/1/'
    myData.crawl( file, smallImgUrl ) # download it

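postPageHtml is the hook for local-file debugging mentioned in the docstring. If you only want to check which image URLs the regular expression matches, without creating folders or downloading anything, a small standalone sketch like this works (post_1.html is a hypothetical file name for a saved thread page):

# Extraction-only debugging sketch: run the image regex against a saved thread page
import re

with open( 'post_1.html', 'r', encoding='utf8' ) as f:
    html = f.read()

# Same pattern as in postPageHtml: capture the image URL, dropping the
# ?imageView2/... thumbnail suffix that the forum appends
arr_picUrl = re.findall( '<img src="(.*?)\?imageView2/2/w/1024/q/90/ignore-error/1/">', html )
for picUrl in arr_picUrl:
    print( picUrl )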
 

myData.py

import urllib.request
import urllib.error

'''
    Download data
  - - - - - - - - - - - - - - - -
    Parameters: file (path + file name), image URL
'''
def crawl( file, url ):

    try:
        img = urllib.request.urlopen( url )
    except urllib.error.HTTPError as e:
        print( e.reason )
    else:
        fp = open( file, 'wb' ) # write in binary (byte) mode
        fp.write( img.read() )
        fp.close()
        print( "Image downloaded successfully" )


'''
Test
'''
# if __name__ == '__main__':
#
#     postNum = '001'

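crawl relies on urllib's default request headers. If the server ever rejects urllib's default User-Agent, one common variation is to send a browser-like header through urllib.request.Request; the following is only a sketch of that option, not part of the original script:

import urllib.request
import urllib.error

def crawlWithHeaders( file, url ):
    # Same idea as myData.crawl, but with an explicit User-Agent header (sketch only)
    req = urllib.request.Request( url, headers={ 'User-Agent': 'Mozilla/5.0' } )
    try:
        img = urllib.request.urlopen( req )
    except urllib.error.HTTPError as e:
        print( e.reason )
    else:
        with open( file, 'wb' ) as fp:
            fp.write( img.read() )
        print( "Image downloaded successfully" )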
 
