A simple Python crawler
This script crawls a site's download links and writes them to a txt file.
If the site has no hotlink protection, you can also try downloading the files directly with the download() function in the script (see the sketch after the script below).
The script was written for a specific short-term target; to crawl resource links with other patterns you will need to adjust the configuration statements (the hardcoded URLs and regular expressions) yourself — they are summarized in a second sketch at the end.
I'm a Python beginner, so corrections and suggestions are welcome.
# -*- coding: utf-8 -*-
import re
import urllib
import os
import urllib2
import requests
import time


# download the file (saved next to the script, named after the url and page)
def download(page, url):
    local_filename = url.split('/')[-1] + page + '.jpg'
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
    return local_filename


# turn the array of matched href attributes into an array of absolute urls
def print_urls(urls):
    output_urls = []
    for link in urls:
        start_link = link.find('"')
        end_link = link.find('"', start_link + 1)
        output_link = link[start_link + 1:end_link]
        if output_link.find('http') == -1:
            output_link = 'http://www.XXX.com' + output_link
        if link.count('"') > 2:
            continue  # skip matches that picked up more than one attribute
        else:
            output_urls.append(output_link)
    return output_urls


# collect title, preview image and download links from a single item page
def output_download_link_page(page):
    url = page
    s = urllib.urlopen(url).read()
    urls = []
    img_urls = 'no image on' + page
    new_stl_urls = []
    title = re.findall(r'<h1>.+</h1>', s, re.I)
    if len(title) != 0:
        title = title[0]
    else:
        title = 'no title'
    img_urls = print_urls(re.findall(r'href=".*?\.jpg.*?"', s, re.I))
    if len(img_urls) != 0:
        img_urls = img_urls[0]
    else:
        img_urls = 'no image' + page
    stl_urls = print_urls(set(re.findall(r'href="/download/.*?"', s, re.I)))
    for url in stl_urls:
        # url = urllib2.urlopen(url).url
        url = requests.get(url).url  # follow redirects to the real file url
        new_stl_urls.append(url)
    urls.append(title)
    urls.append(img_urls)
    urls = urls + new_stl_urls
    return urls

# print output_download_link_page('http://www.XXX.com/thing/46876')


# output all item page links found on one listing page
def output_all_pages(site):
    s = urllib.urlopen(site).read()
    page = re.findall(r'href="/thing/.*?"', s, re.I)
    page = set(page)
    return print_urls(page)


# output all the listing pages to crawl
def generate_sites(start, end):
    sites = []
    for num in range(start, end):
        sites.append('http://www.XXX.com/popular?query=&pg=' + str(num))
    return sites


# write all the results to a txt file ('1.txt' must already exist for mode 'r+')
file_new = open('1.txt', 'r+')
url_pakage = []
sites = generate_sites(40, 46)
count = 0
for site in sites:
    print site
    file_new.write('\n' + site)
    pages = output_all_pages(site)
    for page in pages:
        urls = output_download_link_page(page)
        # if len(urls) >= 10: continue
        count = count + 1
        for url in urls:
            file_new.write(url + '\n')
        print 'done'
        time.sleep(10)  # be polite: pause between item pages
file_new.close()
print 'all done. all..' + str(count) + '..models'
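As mentioned above, the download() function is defined but never called in the script. Below is a rough sketch of how it could be hooked up to save the preview image of one item page, assuming the site does not block direct requests; the page URL is the same placeholder that appears in the commented test call, and only the trailing page id is passed so the generated filename contains no slashes.

# hedged usage sketch: fetch one item page and try to save its preview image
page = 'http://www.XXX.com/thing/46876'      # placeholder page from the commented test call
urls = output_download_link_page(page)
img_url = urls[1]                            # urls = [title, image url, download urls...]
if img_url.startswith('http'):
    # pass only the trailing page id so the filename is valid on disk
    saved = download(page.split('/')[-1], img_url)
    print 'saved preview image as ' + saved
else:
    print img_url                            # the 'no image...' marker string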
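The "configuration statements" mentioned earlier are hardcoded inline in the script. As a rough guide for adapting it, the sketch below collects them into hypothetical constants (the names BASE_URL, LIST_URL, PAGE_LINK_RE and FILE_LINK_RE are mine, not the script's); to point the crawler at another site or resource type, change the corresponding literals where they appear in the functions above.

# hypothetical summary of the values the script hardcodes inline
BASE_URL = 'http://www.XXX.com'                  # used in print_urls() to complete relative hrefs
LIST_URL = BASE_URL + '/popular?query=&pg='      # pagination pattern used in generate_sites()
PAGE_LINK_RE = r'href="/thing/.*?"'              # per-item page links matched in output_all_pages()
FILE_LINK_RE = r'href="/download/.*?"'           # resource links matched in output_download_link_page();
                                                 # e.g. r'href=".*?\.pdf.*?"' would collect PDF links instead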