Coroutines:
 
import grequests
from fake_useragent import UserAgent
 
urls=[f'http://bir删d.so/search?page={page}&q=Python' for page in range(1,9)]
 
tasks=[grequests.get(url,headers={'User-Agent':UserAgent().random}) for url in urls]
grequests.map(tasks)
 
# import time,requests
# st=time.time()
# tasks=[requests.get(url,headers={'User-Agent':UserAgent().random}) for url in urls]
# print(time.time()-st)
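 
The responses returned by grequests.map above are not used; here is a minimal follow-up sketch that captures and checks them (only tasks and urls come from the code above, the rest is assumed):
 
responses=grequests.map(tasks)    #responses come back in request order, None where a request failed
for url,r in zip(urls,responses):
    if r is not None and r.status_code==200:
        print(url,len(r.text))    #quick sanity check per URL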
**********************分割线**********************
Multithreading: run several different functions at the same time, or run the same function once for each element of a pages argument (see the sketch after this code block).
 
import time
from threading import Thread
 
def func1(m,n):
    print(m);time.sleep(4)
    print(m+n);time.sleep(1)
 
def func2(x=666):
    print(x);time.sleep(2)
    print('end of func2')
 
if __name__ == '__main__':
    Thread(target=func1,args=('中','外')).start()
    Thread(target=func2,args=('story',)).start()
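 
A minimal sketch of the second use mentioned above (one function, run once per page); fetchPage and the page range are assumptions, not part of the original:
 
import requests
from threading import Thread
 
def fetchPage(page):    #hypothetical worker: each thread handles one page number
    r=requests.get(f'http://www.example.com/?page={page}')
    print(page,r.status_code)
 
if __name__ == '__main__':
    threads=[Thread(target=fetchPage,args=(page,)) for page in range(1,9)]
    [t.start() for t in threads]
    [t.join() for t in threads]    #wait for every page before moving on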
**********************分割线**********************
Thread pool:
 
import threadpool
import requests,time
start_time=time.time()
 
def func(url):
    r=requests.get(url=url)
 
#threadpool's makeRequests is similar to a process pool's map
reqs=threadpool.makeRequests(func,['http://www.qq.com' for x in range(50)])
p=threadpool.ThreadPool(9)
[p.putRequest(x) for x in reqs]
p.wait()
print(f'Elapsed: {time.time() - start_time}s')
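 
threadpool is an old third-party package; a roughly equivalent sketch with the standard library's concurrent.futures, mirroring the hypothetical URL list and worker above:
 
from concurrent.futures import ThreadPoolExecutor
import requests,time
 
def func(url):
    requests.get(url=url)
 
start_time=time.time()
with ThreadPoolExecutor(max_workers=9) as pool:    #9 worker threads, like ThreadPool(9)
    list(pool.map(func,['http://www.qq.com' for x in range(50)]))    #map plays the role of makeRequests + putRequest
print(f'Elapsed: {time.time() - start_time}s')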
**********************分割线**********************
Process pool:
 
from itertools import repeat
import requests, time
start_time = time.time()
 
def func(url, w='qwert'):
    r=requests.get(url=url)
    print(w)
 
#Option 1: pathos, a mirror of the multiprocessing library; its map accepts a lambda as the first argument and can take multiple argument lists
from pathos.multiprocessing import ProcessingPool
def main():
    p = ProcessingPool(nodes=8)
    p.map(func, ['http://www.example.com/']*5,['wo']*5)    #each extra positional list supplies one parameter's values
    #result=p.amap(pow, [4, 2, 3], [2, 5, 6]).get()    #list(p.imap(pow, [4, 2, 3], [2, 5, 6]))
 
#Option 2: the native multiprocessing library's starmap method
#from multiprocessing import Pool
#def main():
#     with Pool(processes=8) as p:    #the arguments still go in starmap's 2nd parameter, but are no longer limited to a single sequence
#         #p.starmap(func,[('http://www.example.com/','wo')]*9)
#         p.starmap(func,zip(['http://www.example.com/']*9,repeat('wo')))
 
if __name__ == '__main__':  # multiprocessing needs this guard; threads and coroutines do not
    main()
    print(f'Elapsed: {time.time() - start_time}s')
 
map()'s second parameter can only be a single sequence. Also, any variables the worker function uses (e.g. scrapy's item, or testUrl in the example below) should be defined inside that function when it is run by processes, threads, coroutines, or async code; if they are globals, work may be left half-done or results mixed up between tasks (a minimal sketch follows).
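 
A minimal sketch of that point, using hypothetical names: everything the worker needs is either defined inside it or passed in as arguments, so each process gets its own copy instead of relying on module-level state:
 
from multiprocessing import Pool
 
def worker(url,tag):    #hypothetical worker: url and tag arrive as arguments...
    testUrl=f'{url}?from={tag}'    #...and testUrl only exists inside the function
    return testUrl
 
if __name__ == '__main__':
    args=[('http://www.example.com/',str(i)) for i in range(5)]
    with Pool(4) as p:
        print(p.starmap(worker,args))    #each task gets its own argument tuple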
****************************************分割线****************************************
Egのprocess pool: collect IPs from 西刺代理 (Xici) that are currently usable for crawling the target site:
 
import os,re,time,random,requests,pandas
from pandas import DataFrame as DF
from fake_useragent import UserAgent
from multiprocessing import Pool
 
def csvAddTitle():
    title='国家,IP,端口,归属地,匿名,类型,速度,连接时间,存活时间,验证时间'.split(',')
    DF(columns=title).to_csv(collectedIP,mode='a+',index=False,encoding='gbk')
    DF(columns=title).to_csv('D:/usableIP.csv',mode='a+',index=False,encoding='gbk')
 
def collectingIP():
    #the capture groups match Xici's header fields; Sina doesn't allow <, so before posting to the blog the < in this regex were swapped for full-width <
    pattern=re.compile('alt="(Cn)".*?<td>(.*?)<.*?<td>(.*?)<.*?">(.*?)<.*?">(.*?)\
<.*?<td>(.*?)<.*?title="(.*?)".*?title="(.*?)".*?<td>(.*?)<.*?<td>(.*?)<',re.S)
    for page in range(1,9):
        h={'User-Agent':UserAgent().random}
        response=requests.get(f'http://www.xicidaili.com/wt/{page}',headers=h).text
        rows=pattern.findall(response)
        DF(rows).to_csv(collectedIP,mode='a+',index=False,header=False,encoding='gbk')
 
def testIP(series):
    time.sleep(random.random()/5)
    testUrl='http://www.example.com/'
    h={'User-Agent':UserAgent().random}
    ip=f"{series['类型'].lower()}://{series['IP']}:{series['端口']}"
    proxies={ip.split(':')[0]:ip}   #e.g. {'http':'http://59.110.46.8:4000'}
    try:
        if requests.get(testUrl,headers=h,proxies=proxies,timeout=2).status_code==200:
            print(f'{ip}: currently usable for this site')
            return series.to_frame().T
    except Exception:
        print(f'{ip}: not usable for this site')
 
def back(df):
    if df is not None:    #"if df:" would raise "The truth value of a DataFrame is ambiguous"
        df.to_csv('D:/usableIP.csv',mode='a+',index=False,header=False,encoding='gbk')
 
if __name__ == '__main__':
    collectedIP='D:/collectedIP.csv'
    if not os.path.isfile(collectedIP):
        csvAddTitle()
        collectingIP()
    #above: collect a batch of proxies; below: check which of them work for the target site
    dfのCollect=pandas.read_csv(collectedIP,encoding='gbk')
    p=Pool(4)
    [p.apply_async(testIP,(row[1],),callback=back) for row in dfのCollect.iterrows()]
    p.close()
    p.join()
****************************************分割线****************************************
Egのprocess pool: scrape the mzitu site:
 
import requests,os,re
from multiprocessing import Pool
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
 
noName=r'[\\/:*?"<>|]'
 
def bs(response):
    return BeautifulSoup(response,'lxml')
 
def girlsUrl(indexUrl):
    html=requests.get(indexUrl).text
    soup=bs(html)
    girlsUrl=[x['href'] for x in soup.select('#pins li > a')]
    return girlsUrl
 
def imgsUrl(girlUrl):
    soup=bs(requests.get(girlUrl).text)
    title=soup.select('.main-title')[0].get_text(strip=True)
    title=re.sub(noName,' ',title).strip()
    path=os.path.join('E:/pictures/',title)
    if not os.path.isdir(path):os.makedirs(path)
    os.chdir(path)  #after chdir, the save below doesn't need to spell out the folder
    num=int(soup.select('.pagenavi span')[-2].get_text(strip=True))
    imgsUrl=[girlUrl+f'/{page}' for page in range(1,num+1)]
    print(title,f': {num} images in total',sep='')
    return imgsUrl
 
def downLoad(jpgUrl):
    soup=bs(requests.get(jpgUrl).text)
    realUrl=soup.select('.main-image img')[0]['src']
    imgName=realUrl.split('/')[-1]
    h={'User-Agent':UserAgent().random,'Referer':jpgUrl}
    imgContent=requests.get(realUrl,headers=h).content
    with open(imgName,'wb') as jpg:
        jpg.write(imgContent)
 
if __name__ == '__main__':
    indexUrl='http://www.mzitu.com/'
    for girlUrl in girlsUrl(indexUrl):
        #girlUrl='http://www.mzitu.com/89089'   #to download just one gallery, uncomment this line and comment out the two lines above
        jpgsUrl=imgsUrl(girlUrl)  #the function imgsUrl is called inside the loop; don't reuse its name for a variable or it will be overwritten
        p = Pool(8)
        p.map(downLoad,jpgsUrl)
        p.close()
        p.join()
****************************************分割线****************************************
An efficient asynchronous crawler from the tornado examples:
 
import time
from datetime import timedelta
from tornado import httpclient,gen,ioloop,queues
import traceback
 
class AsySpider(object):
    """A simple class of asynchronous spider."""
    def __init__(self,urls,concurrency=10,results=None,**kwargs):
        urls.reverse()
        self.urls=urls
        self.concurrency=concurrency
        self._q=queues.Queue()
        self._fetching=set()
        self._fetched=set()
        if results is None:
            self.results=[]
 
    def fetch(self,url,**kwargs):
        fetch=getattr(httpclient.AsyncHTTPClient(),'fetch')
        return fetch(url,raise_error=False,**kwargs)
 
    def handle_html(self,url,html):
        """handle html page"""
        print(url)
 
    def handle_response(self,url,response):
        """inherit and rewrite this method if necessary"""
        if response.code==200:
            self.handle_html(url,response.body)
        elif response.code==599:   # retry
            self._fetching.remove(url)
            self._q.put(url)
 
    @gen.coroutine
    def get_page(self,url):
        try:
            response=yield self.fetch(url)
            #print('######fetched %s' % url)
        except Exception as e:
            print('Exception:%s %s' % (e,url))
            raise gen.Return(e)
        raise gen.Return(response)
 
    @gen.coroutine
    def _run(self):
 
        @gen.coroutine
        def fetch_url():
            current_url=yield self._q.get()
            try:
                if current_url in self._fetching:
                    return
                #print('fetching******%s' % current_url)
                self._fetching.add(current_url)
                response=yield self.get_page(current_url)
                self.handle_response(current_url,response)    # handle response
                self._fetched.add(current_url)
                for i in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())
            finally:
                self._q.task_done()
 
        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()
        self._q.put(self.urls.pop())    # add first url
        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))
        try:
            assert self._fetching==self._fetched
        except AssertionError:
            print(self._fetching-self._fetched)
            print(self._fetched-self._fetching)
 
    def run(self):
        io_loop=ioloop.IOLoop.current()
        io_loop.run_sync(self._run)
 
class MySpider(AsySpider):
 
    def fetch(self,url,**kwargs):
        """重写父类fetch方法可以添加cookies,headers等信息"""
        cookie="PHPSESSID=nms56ppb70jri4;think_language=zh-cn"
        headers={
            'User-Agent':'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
            'Cookie':cookie
        }
        return super(MySpider,self).fetch(url,headers=headers)
 
    def handle_html(self,url,html):
        #print(url,html)
        print(url)
 
def main():
    urls=[f'http://www.baidu.com/?page={page}' for page in range(1,10000)]
    s=MySpider(urls)
    s.run()
 
if __name__=='__main__':
    main()
****************************************分割线****************************************
Asynchronously scraping JD for Python books:
 
from gevent import monkey;monkey.patch_all()
from gevent.pool import Pool
 
from openpyxl import Workbook
import requests,random,time
from fake_useragent import UserAgent
from lxml.html import fromstring    #from bs4 import BeautifulSoup
 
urls=[f'https://search.jd.com/Search?keyword=Python&page={2*x+1}' for x in range(10)]
rule='.gl-item'
#proxyList=['122.72.18.34:80', '175.5.44.79:808', '221.9.12.4:9000',]
 
def spiderAndParse(url,rule):
    time.sleep(random.random()/4)
    #p=random.choice(proxyList)    #proxies={'http':'http://'+p,'https':'https://'+p}
    h={'User-Agent':UserAgent().random}
    html=requests.get(url,headers=h).content.decode()
 
    lx=fromstring(html)    #soup=BeautifulSoup(html,'lxml');items=soup.select(rule)
    items=lx.cssselect(rule)
 
    for x in items:  #.text is the element's own text; .text_content() gets the descendants' text as an lxml object, [:] turns it into str
        href=f'https://item.jd.com/{x.get("data-sku")}.html'  #dict-like, so read attributes with .get()
        price=x.cssselect('strong> i')[0].text    #the combinators > + ~ need a space on their right; the left side is optional
        comments=x.cssselect('strong> a')[0].text
        name=x.cssselect('a em')[0].text_content()[:].strip()
        try:    #JD self-operated listings have author and date; many third-party sellers don't
            author=x.cssselect('.p-bookdetails> span')[0].text_content()[:].replace(' 著','')
            date=x.cssselect('.p-bookdetails> span')[-1].text
        except:
            author,date=None,None
        ws.append([href,price,comments,date,name,author])
 
def main():
    p=Pool(size=8)
    [p.spawn(spiderAndParse,url,rule) for url in urls]   #first arg is the function, the rest are its arguments
    p.join()
 
if __name__ == '__main__':
    wb=Workbook();ws=wb.active;ws.append(['网址','价格','评论数','出版日期','书名','作者'])
    main()
    wb.save('F:/京东搜索书籍.xlsx')
****************************************分割线****************************************
Anti-SpiderのCustomFont:
Custom font files (.woff): some are referenced from dynamic URLs that change every second (e.g. 猫睛电影), others are saved to the local cache after opening the URL in IE and change every few days (e.g. 天睛查). For the cached kind, if the captured traffic contains no Font file (F12: under Network > Font; in Fiddler it is the blue icon with an 'A'), search the local machine for .woff files — it is usually the most recent one, or the one whose name contains the class attribute value of the scrambled characters.
Open that .woff with the online FontEditor tool (http://fontstore.baidu.com/static/editor/): the blue text is the key (hex, i.e. the obfuscated character used in the page source) and the black text is the value, and that mapping is used to batch-replace the obfuscated characters in the scraped data. If chr(int(the hex word below, stripped of 'uni' and other noise, 16)) ≠ the corresponding single character above it, the fontTools library alone can't handle it (a small sketch of this check follows).
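 
A small sketch of that check with fontTools (the local path is an assumption): dump the font's cmap and see whether chr() of the hex in each glyph name matches the character FontEditor shows; if not, the mapping has to be built by hand.
 
from fontTools.ttLib import TTFont
 
font=TTFont('D:/sample.woff')    #hypothetical local copy of the site's .woff
for code,glyphName in font.getBestCmap().items():    #codepoint -> glyph name, e.g. 0xe8c7 -> 'uniE8C7'
    decoded=chr(int(glyphName[3:],16)) if glyphName.startswith('uni') else glyphName
    print(hex(code),glyphName,decoded)    #compare decoded against the black value shown in FontEditor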
 
CustomFontの天睛查: a company's registered capital, registration date, and approval date
 
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from fontTools.ttLib import TTFont
from gevent import monkey;monkey.patch_all()
from gevent.pool import Pool
 
def fontCorrect(font=''):
    fontFile='D:/tyc-num.woff'
    keys='0'+''.join(TTFont(fontFile).getGlyphOrder()[3:])
    values='.8052431697'  #keys can be extracted with the library; the values change every few days and, for now, have to be typed in by hand
    table=str.maketrans(keys,values)    #one-to-one only; if a key or value had len>1 you would pass a dict instead
    return font.translate(table)
 
def spiderParse(url):
    h={'User-Agent':UserAgent().random,'Referer':'http://www.baidu.com/'}
    soup=BeautifulSoup(requests.get(url,headers=h).text,'lxml')
    d={'公司名称':soup.select('#company_web_top .vertival-middle')[0].text}
    newFonts=[fontCorrect(sp.text) for sp in soup.select('.tyc-num')]
    d.update(dict(zip(['注册资本','注册时间','核准日期'],newFonts)))
    print(d)
 
def main():
    urls=['https://www.tianya删除ncha.com/company/24416401',]
    p=Pool(size=4)
    [p.spawn(spiderParse,url) for url in urls]
    p.join()
 
if __name__=='__main__':
    main()
**********************分割线**********************
CustomFontの实習僧:
 
import requests,re,pandas
from fake_useragent import UserAgent
from io import BytesIO
from base64 import b64decode
from fontTools.ttLib import TTFont
from gevent import monkey;monkey.patch_all()
from gevent.pool import Pool
 
def mapping(url):
    html=requests.get(url,headers={'User-Agent':UserAgent().random}).text
    fontText=re.findall('base64,(.+?)"',html)[0]
    fontFile=BytesIO(b64decode(fontText.encode()))
    zd={}
    for k,v in TTFont(fontFile).getBestCmap().items():
        zd[f'{k:x}']=chr(int(v[3:],16)) if v.startswith('uni') else v
    return zd
 
def spiderParse(url,zd):
    html=requests.get(url,headers={'User-Agent':UserAgent().random}).text
    html=re.sub('&#x(\w{4})',lambda r:zd[r[1]],html.split('<ul>')[0])
    info=re.findall('职位名称">(.+?)<.+?公司名称">(.+?)</a>(.+?)<.+?\
span>(.+?)<.+?/i>(.+?天).+?font">(.+?)<.+?font">(.+?)<',html,re.S)
    pandas.DataFrame(info).to_csv('sxs.csv',header=False,index=False,encoding='gbk',mode='a+')
 
def main():
    urls=[f'https://www.shix删除iseng.com/interns?k=爬虫&p={p}' for p in range(1,10)]
    zd=mapping(urls[0])
    p=Pool(size=4)
    [p.spawn(spiderParse,url,zd) for url in urls]
    p.join()
 
if __name__=='__main__':
    main()
**********************分割线**********************
CustomFontの猫睛: want-to-see count, user rating, number of ratings, and cumulative box office:
Searching the page source for the obfuscated characters' class value turns up a font file //vfile.*.woff, referenced in the JS inside a style tag, that changes constantly. Open it with a tool such as the online FontEditor: the blue keys are unicode strings in the Private Use Area E000-F8FF, which can be extracted directly with the fontTools library and then converted to the same format as the obfuscated characters in the source; the black values above them follow no pattern, however. So run TTFont(*.woff).saveXML('*.xml') and study the .xml of a few .woff files: under the glyf tag, glyphs with the same value also have identical TTGlyph blocks (a small inspection sketch follows).
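 
A small inspection sketch for that step (the file names are assumptions): save a couple of captured .woff files as XML and compare their glyf sections; the full example follows.
 
from fontTools.ttLib import TTFont
 
for name in ('D:/maoyan_a.woff','D:/maoyan_b.woff'):    #hypothetical local copies of two captured fonts
    font=TTFont(name)
    font.saveXML(name.replace('.woff','.xml'))    #diff the <glyf> sections: equal digits have identical TTGlyph blocks
    print(name,font.getGlyphOrder()[2:])    #glyph names beyond the first two placeholders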
 
import requests,re
from fake_useragent import UserAgent
from io import BytesIO
from fontTools.ttLib import TTFont
from gevent import monkey;monkey.patch_all()
from gevent.pool import Pool
 
baseFontFile='D:/model.woff'    #saved locally beforehand, used as the model the current .woff files are compared against
baseKeys=tuple(TTFont(baseFontFile)['glyf'].glyphs.values())[2:]    #this type can't be used as a dict key, hence the tuple
baseValues='9427635801' #the digits in glyph order, excluding the first two; if it stops matching, save a new model.woff
 
def fontMapping(customFontUrl):
    fonts=TTFont(BytesIO(requests.get(customFontUrl).content))['glyf']
    keys=[font[3:].lower() for font in fonts.glyphOrder[2:]]
    values=[baseValues[baseKeys.index(k)] for k in tuple(fonts.glyphs.values())[2:]]
    maps=dict(zip(keys,values))
    return maps
 
def spiderParse(url):
    html=requests.get(url,headers={'User-Agent':UserAgent().random}).text
    customFontUrl='http:'+re.findall('//vfile.+?woff',html)[0]
    maps=fontMapping(customFontUrl)
    html=re.sub('&#x(\w{4});',lambda r:maps[r[1]],html) #bs4 would force-decode the obfuscated entities into mojibake
    #print('Restored page source:\n'+html);exit()
    result={'电影名称':re.findall('h3 class="name">(.+?)<',html)[0]}
    data=re.findall('stonefont">(.+?)<',html)
    if len(data)==3:    #data is extracted from each film's own page; the homepage and ranking pages that list many films aren't handled
        result.update(dict(zip(['用户评分','评分人数','累计票房'],data)))
    elif len(data)==2:
        result.update(dict(zip(['想看数','累计票房'],data)))
    elif len(data)==1:
        result.update({'想看数':data[0]})
    if result.get('累计票房'):
        result['累计票房']+=re.findall('unit">(.+?)<',html)[0]
    print(result)
 
def main():
    urls=['http://maoy删除an.com/films/342068',
        'http://maoy删an.com/films/338463','http://maoy删an.com/films/346272']
    p=Pool(size=4)
    [p.spawn(spiderParse,url) for url in urls]
    p.join()
 
if __name__=='__main__':
    main()