# 正规函数编写与调用示例：猫眼电影榜单抓取

import requests,re,json,time
from requests.exceptions import RequestException
# HTTP headers passed to requests.get() by get_one_page; only the
# User-Agent is overridden.
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
}
def get_one_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, or the request itself failed).
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        r = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Connection errors, timeouts, etc. are reported the same way as a
        # bad status code: the caller only needs to know "no page".
        return None
    if r.status_code == 200:
        return r.text
    return None

def parse_one_page(html):
    """Yield one dict per ``<dd>`` movie entry found in ``html``.

    Args:
        html: Page source as a string. ``None`` or an empty string is
            accepted and produces no items.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time`` and ``score``; every value is a string.
    """
    if not html:
        # A failed download hands us None; there is nothing to parse.
        return

    # Raw strings keep ``\d`` a regex escape rather than an (invalid)
    # string escape. re.S lets ``.`` cross the newlines inside each entry.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )

    for index, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title.strip(),
            # Drop the first 3 characters (presumably a leading label such
            # as "主演：" -- confirm against the live page). Slicing past
            # the end of a short string already gives ''.
            'actor': star.strip()[3:],
            # Same idea with a 5-character leading label.
            'time': release.strip()[5:],
            # Score is split on the page into integer and fraction parts.
            'score': integer.strip() + fraction.strip(),
        }
def write_to_file(content):
    """Append ``content`` to ``result.txt`` as one JSON document per line.

    Args:
        content: Any JSON-serializable object.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned; the platform default may be unable to encode them.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(offset):
    """Fetch the board page at ``offset``, then print and save each entry.

    Args:
        offset: Value placed in the ``offset`` query parameter of the URL.
    """
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page signals a failed download with None; skip this page
        # instead of handing None to the parser.
        print('Failed to fetch page:', url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
    
if __name__ == '__main__':
    # Visit offsets 0, 10, ..., 90, pausing one second after each page.
    for page_offset in range(0, 100, 10):
        main(offset=page_offset)
        time.sleep(1)

 

# posted @ 2019-03-13 16:23  晨光曦微  阅读(227)  评论(0)  编辑  收藏  举报