爬取链家二手房

1. 利用正则表达式爬取数据,并将爬取出来的数据存储到 csv 文件以及 MySQL、MongoDB 数据库中。

import csv
import warnings

import pymongo
import pymysql
import requests
import re


class LIANJIA:
    """Scrape Lianjia second-hand housing pages and persist the listings.

    Each listing is extracted with a regular expression as a
    ``(name, total price, unit price)`` tuple of raw strings.  ``parse``
    stores the results in MongoDB; ``saveTocsv`` and ``saveTomysql`` are
    alternative sinks that can be called with the same list of tuples.
    """

    # Seconds to wait for the server before giving up on a page, so that a
    # stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 10

    # One match per listing; groups are (region name, total price, unit price).
    # re.S lets ".*?" span the newlines between the tags.
    _LISTING_RE = re.compile(
        r'<div class="info clear">.*?data-el="region">(.*?)</a>'
        r'.*?class="totalPrice"><span>(.*?)</span>'
        r'.*?<span>(.*?)</span></div>',
        re.S,
    )

    # Connection settings for the optional MySQL sink (see saveTomysql).
    _MYSQL_SETTINGS = {
        "host": "localhost",
        "user": "root",
        "password": "123456",
        "charset": "utf8",
    }

    def __init__(self):
        """Set up request parameters and open the MongoDB collection."""
        # "%s" is replaced by the city abbreviation entered in workOn().
        self.url = "https://%s.lianjia.com/ershoufang/"
        # NOTE(review): requests appears to look proxies up by lower-case URL
        # scheme, so this upper-case "HTTP" key is probably never applied to
        # the https:// URLs built here -- confirm before relying on the proxy.
        self.proxies = {"HTTP": "http://116.255.162.107:16816"}
        self.headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"}

        # MongoDB: database "lianjia", collection "lianjiafang".
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn["lianjia"]
        self.tab = self.db.lianjiafang

        # MySQL is connected lazily by saveTomysql(); it is kept in separate
        # attributes so it cannot be confused with the MongoDB handle self.db.
        self.mysql_db = None
        self.cursor = None

    @staticmethod
    def _price_to_yuan(text):
        """Convert the scraped total-price text to an integer.

        The value is multiplied by 10000 (presumably the page quotes prices
        in units of 10,000 yuan -- verify against the site).  ``float`` is
        used so decimal prices such as "123.5" no longer raise.

        Raises:
            ValueError: if ``text`` is not a number.
        """
        return round(float(text.strip()) * 10000)

    @classmethod
    def _clean(cls, result_tuple):
        """Return ``(name, price, sq_mPrice)`` with whitespace stripped and the price as int."""
        name, price, sq_m_price = result_tuple
        return name.strip(), cls._price_to_yuan(price), sq_m_price.strip()

    def _mysql_cursor(self):
        """Return a MySQL cursor, connecting on first use."""
        if self.cursor is None:
            self.mysql_db = pymysql.connect(**self._MYSQL_SETTINGS)
            self.cursor = self.mysql_db.cursor()
        return self.cursor

    def getHtml(self, url):
        """Download ``url`` and hand the decoded HTML to ``parse``.

        Pages answered with an HTTP error status are reported and skipped
        instead of being parsed as if they were listing pages.
        """
        response = requests.get(
            url,
            proxies=self.proxies,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        if not response.ok:
            print("skip %s: HTTP %s" % (url, response.status_code))
            return
        response.encoding = "utf-8"
        self.parse(response.text)

    def parse(self, html):
        """Extract the listings from ``html`` and store them in MongoDB.

        Returns:
            The list of raw ``(name, total price, unit price)`` string tuples
            found on the page (empty when nothing matches).
        """
        # findall with three groups yields a list of 3-tuples.
        result_list = self._LISTING_RE.findall(html)
        print(result_list)
        self.saveTomongo(result_list)
        return result_list

    def saveTocsv(self, result_list):
        """Append the raw result tuples to ``lianjia.csv``, one row each."""
        for result in result_list:
            print(result)
        # Open the file once for the whole batch; UTF-8 is set explicitly so
        # the output does not depend on the platform's locale encoding.
        with open("lianjia.csv", "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerows(result_list)

    def saveTomysql(self, result_list):
        """Insert the listings into the MySQL table ``lianjia.lianjiafang``.

        The database and table are created when missing.  All rows are
        inserted in one batch and committed once.

        Raises:
            ValueError: if a total price is not numeric.
        """
        cd_db = "create database if not exists lianjia charset utf8"
        u_db = "use lianjia"
        c_tab = "create table if not exists lianjiafang(\
                            id int primary key auto_increment,\
                            name varchar(100),\
                            price varchar(100),\
                            sq_mPrice varchar(100)\
                            )"
        ins = "insert into lianjiafang(name, price, sq_mPrice)\
                        values(%s, %s, %s)"
        # Clean first so bad input fails before any connection is opened.
        rows = [self._clean(result_tuple) for result_tuple in result_list]
        cursor = self._mysql_cursor()
        # Suppress warnings only around the idempotent DDL rather than
        # silencing them for the whole process.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                cursor.execute(cd_db)
                cursor.execute(u_db)
                cursor.execute(c_tab)
            except pymysql.MySQLError as exc:
                # Still best-effort, but no longer silent: a real failure
                # will also surface again on the insert below.
                print("MySQL schema setup failed: %s" % exc)
        if rows:
            cursor.executemany(ins, rows)
            self.mysql_db.commit()
        print("OK")

    def saveTomongo(self, result_list):
        """Insert the listings into the MongoDB collection ``self.tab``.

        Documents use the keys ``name``, ``price`` and ``sq_mPrice``,
        matching the MySQL column names.

        Raises:
            ValueError: if a total price is not numeric.
        """
        documents = []
        for result_tuple in result_list:
            name, price, sq_mPrice = self._clean(result_tuple)
            documents.append({"name": name, "price": price, "sq_mPrice": sq_mPrice})
        # insert_many rejects an empty batch, so skip pages with no matches.
        if documents:
            self.tab.insert_many(documents)
        print("OK")

    def close(self):
        """Close the MongoDB client and, if it was opened, the MySQL connection."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.mysql_db is not None:
            self.mysql_db.close()
            self.mysql_db = None
        self.conn.close()

    def workOn(self):
        """Prompt for a city abbreviation and a page count, then crawl each page."""
        city = input("请输入你要搜索的城市首拼音:").strip()
        end = int(input("爬取多少页:"))
        base_url = self.url % city
        for page in range(1, end + 1):
            # Page 1 is the bare listing URL; later pages append "pg<N>/".
            if page == 1:
                url = base_url
            else:
                url = "%spg%d/" % (base_url, page)
            self.getHtml(url)


if __name__ == "__main__":
    # Script entry point: build the scraper and start the interactive crawl.
    crawler = LIANJIA()
    crawler.workOn()

得到csv中的结果,mysql和mongo结果就不显示了:

 

posted on 2018-11-26 18:52  zengsf  阅读(195)  评论(0)  编辑  收藏  举报

导航