目前完成了我爱我家和链家的房源信息获取,代码写得比较粗糙~
houseWoaiwojia.py:
# encoding=utf-8
"""Scrape second-hand housing listings from 5i5j (我爱我家) into the `house` table.

Run as a script: loads the listing codes already stored for this source, then
walks result pages n1..n100 and bulk-inserts every listing not seen before.
"""
import result as r
import db
import datetime
import houseLianjiaVo as vo
import json
import time

# Codes of listings already stored or collected during this run; getdata()
# consults and extends it to skip duplicates.
arrayList = []
domain = "https://hz.5i5j.com"
# Value written to the `type` column for every row produced by this script.
house_type = "我爱我家"


def _split_fields(text, separator, count):
    """Split *text* on *separator* into exactly *count* stripped fields.

    Missing trailing fields are filled with "" so that a listing with a short
    description is still stored instead of failing on an index error.
    """
    fields = [part.strip() for part in text.split(separator)]
    fields += [""] * (count - len(fields))
    return fields[:count]


def _sql_quote(value):
    """Return *value* as a single-quoted SQL string literal.

    Backslashes and single quotes are doubled so scraped text cannot terminate
    the literal early.
    """
    text = str(value).replace("\\", "\\\\").replace("'", "''")
    return "'%s'" % text


def getdata(url):
    """Fetch one result page and return the new listings found on it.

    Args:
        url: address of a 5i5j result page.

    Returns:
        A list of ``houseLianjiaVo`` objects, one per listing whose code was
        not yet in ``arrayList``. The codes of the returned listings are
        appended to ``arrayList``. A listing that cannot be parsed is reported
        on stdout and skipped.
    """
    soup = r.getUrl(url)
    containers = soup.find_all("ul", class_="pList")
    if not containers:
        # Without the result list there is nothing to parse on this page.
        print("url:%s 未找到房源列表" % url)
        return []
    all_li = containers[0].find_all("li")
    print("url:%s li数量:%i" % (url, len(all_li)))

    listings = []
    for item in all_li:
        title = ""
        try:
            headings = item.find_all("h3", class_="listTit")
            if not headings:
                # <li> elements without a title heading are not listings.
                continue
            title_a = headings[0].find_all("a")[0]
            title = title_a.getText()
            code = json.loads(title_a["tdjson"])["content"]
            if code in arrayList:
                continue

            lazy_img = item.find_all("img", class_="lazy")[0]
            # Use `src` when present, otherwise fall back to `data-src`.
            img = lazy_img.attrs.get("src", lazy_img.attrs.get("data-src", ""))
            href = domain + title_a["href"]

            info_paragraphs = item.find_all("div", class_="listX")[0].find_all("p")
            house_info = info_paragraphs[0].getText()
            region2 = info_paragraphs[1].getText().split(" ")[0]
            region1 = info_paragraphs[1].find_all("a")[0].getText()
            release_time = info_paragraphs[2].getText().split("·")[2]

            price_box = item.find_all("div", class_="jia")[0]
            total_price = price_box.find_all("strong")[0].getText()
            data_price = price_box.find_all("p")[1].getText()

            # On this site the floor comes before the decoration degree.
            (house_structure, total_square, orientation,
             floor, decoration_degree, material) = _split_fields(house_info, "·", 6)

            create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            listings.append(vo.houseLianjiaVo(
                code, img, title, href, region1, region2,
                house_structure, total_square, orientation, decoration_degree,
                floor, material, create_time, release_time, total_price,
                data_price))
            arrayList.append(code)
        except Exception as err:
            # Best effort: one malformed listing must not abort the page.
            print("失败,url:%s,title:%s,error:%r" % (url, title, err))
    return listings


def encapsulation_db(list):
    """Build one multi-row ``insert into house`` statement.

    Args:
        list: non-empty sequence of ``houseLianjiaVo`` objects.

    Returns:
        The SQL text. Values are given positionally: ``null`` for the
        auto-increment id, then ``house_type``, then the listing fields in
        table-column order.

    Raises:
        ValueError: if *list* is empty (there is no valid zero-row INSERT).
    """
    listings = list  # parameter name kept for callers; it shadows the builtin here
    if not listings:
        raise ValueError("encapsulation_db() needs at least one listing")
    rows = []
    for d in listings:
        values = (
            house_type, d.code, d.img, d.title, d.href, d.region1, d.region2,
            d.house_structure, d.total_square, d.orientation,
            d.decoration_degree, d.floor, d.material, d.create_time,
            d.release_time, d.total_price, d.data_price,
        )
        rows.append("(null,%s)" % ",".join(_sql_quote(v) for v in values))
    return " insert into house values" + ",".join(rows) + ";"


def date_util(date):
    """Format a millisecond Unix timestamp as local ``YYYY-mm-dd HH:MM:SS``.

    Local time is used so the result agrees with ``create_time``, which is
    built from ``time.localtime()``; formatting in UTC was 8 hours off.
    """
    return datetime.datetime.fromtimestamp(date / 1000).strftime("%Y-%m-%d %H:%M:%S")


def main():
    """Scrape pages n1..n100 and insert the new listings page by page."""
    # NOTE(review): database credentials are hard-coded; consider moving them
    # to configuration or environment variables.
    db_handle = db.DataBaseHandle('127.0.0.1', 'root', '1qaz@WSX', 'house', 3306)
    try:
        stored = db_handle.selectDb("select code from house where type ='%s' " % house_type)
        if stored is None:
            # selectDb signals failure with None; without the stored codes
            # duplicates could not be detected, so stop here.
            print("查询已有房源编码失败,终止抓取")
            return
        for row in stored:
            arrayList.append(row[0])

        for i in range(1, 101):
            page = "n%s" % i
            url = domain + "/ershoufang/" + page
            listings = getdata(url)
            if not listings:
                continue
            sql = encapsulation_db(listings)
            try:
                # NOTE(review): insertDB handles and prints database errors
                # itself, so this message does not prove the rows were stored.
                db_handle.insertDB(sql)
                print("插入数据库成功%s" % page)
            except Exception:
                print("插入数据库失败%s" % page)
    finally:
        db_handle.closeDb()


if __name__ == '__main__':
    main()
houseLianjia.py:
# encoding=utf-8
"""Scrape second-hand housing listings from Lianjia (链家) into the `house` table.

Run as a script: loads the listing codes already stored for this source, then
walks result pages pg1..pg100 and bulk-inserts every listing not seen before.
"""
import result as r
import db
import datetime
import houseLianjiaVo as vo
import time

# Codes of listings already stored or collected during this run; getdata()
# consults and extends it to skip duplicates.
arrayList = []
# Value written to the `type` column for every row produced by this script.
house_type = "链家"


def _split_fields(text, separator, count):
    """Split *text* on *separator* into exactly *count* fields.

    Missing trailing fields are filled with "" so that a listing with a short
    description is still stored instead of failing on an index error. The
    fields are returned as found (not stripped).
    """
    fields = text.split(separator)
    fields += [""] * (count - len(fields))
    return fields[:count]


def _sql_quote(value):
    """Return *value* as a single-quoted SQL string literal.

    Backslashes and single quotes are doubled so scraped text cannot terminate
    the literal early.
    """
    text = str(value).replace("\\", "\\\\").replace("'", "''")
    return "'%s'" % text


def getdata(url):
    """Fetch one result page and return the new listings found on it.

    Args:
        url: address of a Lianjia result page.

    Returns:
        A list of ``houseLianjiaVo`` objects, one per listing whose code was
        not yet in ``arrayList``. The codes of the returned listings are
        appended to ``arrayList``. A listing that cannot be parsed is reported
        on stdout and skipped.
    """
    soup = r.getUrl(url)
    containers = soup.find_all("ul", class_="sellListContent")
    if not containers:
        # Without the result list there is nothing to parse on this page.
        print("url:%s 未找到房源列表" % url)
        return []
    all_li = containers[0].find_all("li")
    print("url:%s li数量:%i" % (url, len(all_li)))

    listings = []
    for item in all_li:
        title = ""
        try:
            title_a = item.find_all("div", class_="title")[0].find_all("a")[0]
            title = title_a.getText()
            code = title_a["data-housecode"]
            if code in arrayList:
                continue

            img = item.find_all("img", class_="lj-lazy")[0]["src"]
            href = title_a["href"]

            position_links = item.find_all("div", class_="positionInfo")[0].find_all("a")
            region1 = position_links[0].getText()
            region2 = position_links[1].getText()

            house_info = item.find_all("div", class_="houseInfo")[0].getText()
            # On this site the decoration degree comes before the floor.
            (house_structure, total_square, orientation,
             decoration_degree, floor, material) = _split_fields(house_info, "|", 6)

            release_time = item.find_all("div", class_="followInfo")[0].getText().split("/")[1]
            total_price = item.find_all("div", class_="totalPrice")[0].find_all("span")[0].getText()
            data_price = item.find_all("div", class_="unitPrice")[0]["data-price"]

            create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            listings.append(vo.houseLianjiaVo(
                code, img, title, href, region1, region2,
                house_structure, total_square, orientation, decoration_degree,
                floor, material, create_time, release_time, total_price,
                data_price))
            arrayList.append(code)
        except Exception as err:
            # Best effort: one malformed listing must not abort the page.
            print("失败,url:%s,title:%s,error:%r" % (url, title, err))
    return listings


def encapsulation_db(list):
    """Build one multi-row ``insert into house`` statement.

    Args:
        list: non-empty sequence of ``houseLianjiaVo`` objects.

    Returns:
        The SQL text. Values are given positionally: ``null`` for the
        auto-increment id, then ``house_type``, then the listing fields in
        table-column order.

    Raises:
        ValueError: if *list* is empty (there is no valid zero-row INSERT).
    """
    listings = list  # parameter name kept for callers; it shadows the builtin here
    if not listings:
        raise ValueError("encapsulation_db() needs at least one listing")
    rows = []
    for d in listings:
        values = (
            house_type, d.code, d.img, d.title, d.href, d.region1, d.region2,
            d.house_structure, d.total_square, d.orientation,
            d.decoration_degree, d.floor, d.material, d.create_time,
            d.release_time, d.total_price, d.data_price,
        )
        rows.append("(null,%s)" % ",".join(_sql_quote(v) for v in values))
    return " insert into house values" + ",".join(rows) + ";"


def date_util(date):
    """Format a millisecond Unix timestamp as local ``YYYY-mm-dd HH:MM:SS``.

    Local time is used so the result agrees with ``create_time``, which is
    built from ``time.localtime()``; formatting in UTC was 8 hours off.
    """
    return datetime.datetime.fromtimestamp(date / 1000).strftime("%Y-%m-%d %H:%M:%S")


def main():
    """Scrape pages pg1..pg100 and insert the new listings page by page."""
    # NOTE(review): database credentials are hard-coded; consider moving them
    # to configuration or environment variables.
    db_handle = db.DataBaseHandle('127.0.0.1', 'root', '1qaz@WSX', 'house', 3306)
    try:
        stored = db_handle.selectDb("select code from house where type ='%s' " % house_type)
        if stored is None:
            # selectDb signals failure with None; without the stored codes
            # duplicates could not be detected, so stop here.
            print("查询已有房源编码失败,终止抓取")
            return
        for row in stored:
            arrayList.append(row[0])

        for i in range(1, 101):
            page = "pg%s" % i
            url = "https://hz.lianjia.com/ershoufang/" + page + "co32/"
            listings = getdata(url)
            if not listings:
                continue
            sql = encapsulation_db(listings)
            try:
                # NOTE(review): insertDB handles and prints database errors
                # itself, so this message does not prove the rows were stored.
                db_handle.insertDB(sql)
                print("插入数据库成功%s" % page)
            except Exception:
                print("插入数据库失败%s" % page)
    finally:
        db_handle.closeDb()


if __name__ == '__main__':
    main()
result.py:
"""HTTP helpers shared by the scrapers: JSON GET/POST and HTML-to-soup fetch."""
import requests
from bs4 import BeautifulSoup
import ip_list

# Headers sent with every request. Left empty on purpose; fill in browser-like
# values (User-Agent, Accept-Language, Cookie, ...) here if a site requires them.
headers = {}

# Seconds to wait for a server before giving up. requests has no default
# timeout, so without this a stalled connection would hang the scraper forever.
REQUEST_TIMEOUT = 10


def postUrl(url, json_data):
    """POST *json_data* as a JSON body to *url* and return the decoded JSON reply."""
    result = requests.post(url, json=json_data, headers=headers, timeout=REQUEST_TIMEOUT)
    return result.json()


def getUrlJson(url):
    """GET *url* and return the decoded JSON reply."""
    result = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    return result.json()


def getUrl(url):
    """GET *url* and return the page parsed as a BeautifulSoup object.

    The body is parsed with the built-in ``html.parser``. Proxy support through
    ``ip_list.get_ip_list_random()`` is currently disabled.
    """
    content = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).content
    return BeautifulSoup(content, "html.parser")
db.py:
import pymysql


class DataBaseHandle(object):
    """Small helper around a single PyMySQL connection.

    The connection is opened when the object is created, so no separate
    connect call is needed. Every method opens its own cursor, runs one
    statement and closes the cursor again. Database errors are printed rather
    than raised.
    """

    def __init__(self, host, username, password, database, port):
        """Store the connection settings and open the connection."""
        self.host = host
        self.username = username
        self.password = password
        self.database = database
        self.port = port
        # Keyword arguments are required: PyMySQL 1.0 made connect()
        # keyword-only, so the former positional call fails there.
        self.db = pymysql.connect(
            host=self.host,
            user=self.username,
            password=self.password,
            database=self.database,
            port=self.port,
            charset='utf8',
        )

    def _execute_write(self, sql, args=None):
        """Run one modifying statement and commit it.

        Returns the affected-row count reported by the cursor, or None when
        the statement failed (the transaction is rolled back and the error is
        printed).
        """
        self.cursor = self.db.cursor()
        try:
            affected = self.cursor.execute(sql, args)
            self.db.commit()
            return affected
        except Exception as ex:
            # Roll back on failure so the connection stays usable.
            self.db.rollback()
            print("数据库异常", ex)
            return None
        finally:
            self.cursor.close()

    def insertDB(self, sql, args=None):
        """Execute an INSERT.

        Args:
            sql: statement text; may contain ``%s`` placeholders when *args*
                is given.
            args: optional parameters bound by the driver.

        Returns:
            Affected-row count, or None on failure.
        """
        return self._execute_write(sql, args)

    def deleteDB(self, sql, args=None):
        """Execute a DELETE; same arguments and return value as insertDB."""
        return self._execute_write(sql, args)

    def updateDb(self, sql, args=None):
        """Execute an UPDATE; same arguments and return value as insertDB."""
        return self._execute_write(sql, args)

    def selectDb(self, sql, args=None):
        """Execute a query and return all rows.

        Args:
            sql: statement text; may contain ``%s`` placeholders when *args*
                is given.
            args: optional parameters bound by the driver.

        Returns:
            The rows from ``fetchall()`` (also printed to stdout), or None
            when the query failed.
        """
        self.cursor = self.db.cursor()
        try:
            self.cursor.execute(sql, args)
            data = self.cursor.fetchall()
            print(data)
            return data
        except Exception as ex:
            print("数据库异常", ex)
            return None
        finally:
            self.cursor.close()

    def closeDb(self):
        """Close the database connection."""
        self.db.close()


if __name__ == '__main__':
    # NOTE(review): database credentials are hard-coded; consider moving them
    # to configuration or environment variables.
    DbHandle = DataBaseHandle('127.0.0.1', 'root', '1qaz@WSX', 'test', 3306)
    try:
        DbHandle.selectDb('select * from dept limit 5 ')
    finally:
        DbHandle.closeDb()
实体封装 houseLianjiaVo.py:
class houseLianjiaVo:
    """Plain value object describing one scraped housing listing."""

    def __init__(self, code, img, title, href, region1, region2,
                 house_structure, total_square, orientation,
                 decoration_degree, floor, material, create_time,
                 release_time, total_price, data_price):
        """Store every field unchanged; ``id`` starts out as None."""
        # Auto-increment id; never supplied by the caller.
        self.id = None
        # Listing code, image URL, title text and link.
        self.code, self.img, self.title, self.href = code, img, title, href
        # region1: community name; region2: district.
        self.region1, self.region2 = region1, region2
        # Layout, total area and orientation.
        self.house_structure, self.total_square, self.orientation = (
            house_structure, total_square, orientation)
        # Decoration degree, floor and building material.
        self.decoration_degree, self.floor, self.material = (
            decoration_degree, floor, material)
        # Record creation time and the listing's release time.
        self.create_time, self.release_time = create_time, release_time
        # Total price and unit price.
        self.total_price, self.data_price = total_price, data_price

    def __str__(self):
        """Return a short ``code:… title:… `` summary (with trailing space)."""
        return 'code:' + str(self.code) + ' title:' + str(self.title) + ' '
数据库表结构
-- Listing table filled by both scraper scripts. They insert positionally
-- (`insert into house values (null, <type>, <code>, ...)`), so the column
-- order below must stay in step with the value order built in the scripts.
-- All scraped values, including prices and timestamps, are stored as varchar.
-- The table comment names only 链家, but rows from other sources are stored
-- here too and are told apart by the `type` column.
-- NOTE(review): AUTO_INCREMENT=14999 looks carried over from an export of a
-- populated table - confirm before reusing this statement on a fresh database.
CREATE TABLE `house` (
  `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT '自增id',
  -- Source site label written by the scraper (its `house_type` value).
  `type` varchar(255) DEFAULT NULL,
  -- Listing code from the source site. Its column comment says "primary key",
  -- but the actual primary key is `id`; `code` only has a non-unique index.
  `code` varchar(32) DEFAULT NULL COMMENT '主键',
  `img` varchar(1000) DEFAULT NULL COMMENT '图片',
  `title` varchar(32) DEFAULT NULL COMMENT '标题',
  `href` varchar(256) DEFAULT NULL COMMENT '链接',
  `region1` varchar(32) DEFAULT NULL COMMENT '小区名',
  `region2` varchar(32) DEFAULT NULL COMMENT '区域',
  `house_structure` varchar(32) DEFAULT NULL COMMENT '房屋结构',
  `total_square` varchar(32) DEFAULT NULL COMMENT '总平方数',
  `orientation` varchar(32) DEFAULT NULL COMMENT '朝向',
  `decoration_degree` varchar(32) DEFAULT NULL COMMENT '装修程度',
  `floor` varchar(32) DEFAULT NULL COMMENT '楼层',
  `material` varchar(32) DEFAULT NULL COMMENT '材料',
  `create_time` varchar(32) DEFAULT NULL COMMENT '创建时间',
  `release_time` varchar(32) DEFAULT NULL COMMENT '发布时间',
  `total_price` varchar(32) DEFAULT NULL COMMENT '总价',
  `data_price` varchar(32) DEFAULT NULL COMMENT '单价',
  PRIMARY KEY (`id`) USING BTREE,
  KEY `idx_code` (`code`) USING BTREE COMMENT '编码索引'
) ENGINE=InnoDB AUTO_INCREMENT=14999 DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC COMMENT='房屋信息表-链家';