正则应用之数据采集房屋网站信息

import re
import json
from urllib.request import urlopen
import ssl
# Disable HTTPS certificate verification for the whole process by replacing
# the default HTTPS context factory with the unverified one.
# NOTE(review): this relies on private ssl attributes and removes protection
# against man-in-the-middle attacks for every HTTPS request made afterwards.
ssl._create_default_https_context = ssl._create_unverified_context

# URL fetched for the first page of results (see main()).
ershoufang_url='https://bj.lianjia.com/ershoufang/rs/'

def get_html_content(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: The address to request; anything accepted by ``urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Use the response as a context manager so the underlying connection is
    # closed even when read() or decode() raises; the original never closed it.
    with urlopen(url) as response:
        return response.read().decode('utf-8')
def chuli(content):
    """Yield one dict of extracted fields per listing found in *content*.

    The page HTML is scanned with a single regular expression (DOTALL mode,
    so ``.`` also spans newlines).  Each match produces a dict whose keys
    are the Chinese labels written to the output:

    * ``'价格:'``     - text of the price span that precedes ``万``
    * ``'房屋信息:'`` - text of the title link
    * ``'平米数:'``   - third ``/``-separated item of the info div
      (presumably the floor area, going by the key name)
    * ``'朝向'``      - fourth item (presumably the orientation)
    * ``'装修:'``     - the remaining items, with the ``/`` separators
      replaced by commas
    * ``'房本信息:'`` - text of a tag span; the placeholder values
      ``随时看房`` and ``关注`` are replaced by ``无信息``

    Args:
        content: HTML text of one listing page.

    Yields:
        dict mapping the labels above to the extracted strings.
    """
    listing_pattern = re.compile(r'<span.*?>关注</span></div><div.*?><span></span></div><div.*?><span></span></div><div class="price"><span>(?P<price>.*?)</span>万</div></a><a.*?>(?P<title>.*?)</a><div class="info">.*?<span>/</span>.*?<span>/</span>(?P<pingmi>.*?)<span>/</span>(?P<fangxiang>.*?)<span>/</span>(?P<zhuangxiu>.*?)</div><div .*?>(?:<span .*?>.*?</span>)?<span.*?>(?P<fangben>.*?)</span>',re.S)
    for match in listing_pattern.finditer(content):
        fields = match.groupdict()
        # The tail of the info div still contains raw separators; flatten
        # them into a comma-separated string.
        decoration = fields['zhuangxiu'].replace('<span>/</span>', ',')
        # These two texts are captured when no real tag is present, so they
        # are normalised to the "no information" marker.
        deed = fields['fangben'].replace('随时看房', '无信息')
        deed = deed.replace('关注', '无信息')
        yield {
            '价格:': fields['price'],
            '房屋信息:': fields['title'],
            '平米数:': fields['pingmi'],
            '朝向': fields['fangxiang'],
            '装修:': decoration,
            '房本信息:': deed,
        }
def xieru(jieguo):
    """Append *jieguo* as one JSON line to the file ``houseInfo``.

    The record is serialised with ``ensure_ascii=False`` so non-ASCII text
    is written as-is rather than as ``\\uXXXX`` escapes.  The file is opened
    in append mode with UTF-8 encoding, relative to the current working
    directory.

    Args:
        jieguo: A JSON-serialisable object (main() passes the dicts yielded
            by chuli()).
    """
    # Serialise before opening the file so a serialisation error leaves the
    # file untouched.
    line = json.dumps(jieguo, ensure_ascii=False) + '\n'
    with open('houseInfo', mode='a', encoding='utf-8') as out_file:
        out_file.write(line)

def main():
    """Fetch pages 1 through 100, then save and print every extracted record.

    Page 1 is read from ``ershoufang_url``; every later page uses the
    ``pg<N>`` URL form.  Each record yielded by chuli() is appended to the
    output file via xieru() and then echoed to stdout.
    """
    for page_number in range(1, 101):
        page_url = (
            ershoufang_url
            if page_number == 1
            else 'https://bj.lianjia.com/ershoufang/pg%d/' % page_number
        )
        page_html = get_html_content(page_url)
        for record in chuli(page_html):
            xieru(record)
            print(record)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

 

posted @ 2018-10-14 11:12  冒蓝火的加特林哒哒哒  阅读(285)  评论(0)  编辑  收藏  举报