A simple Python crawler that saves Baidu and 360 search results to a database
import requests
import re
from pyquery import PyQuery as Pq
import pymysql.cursors

connection = pymysql.connect(host='localhost',
                             user='root',
                             password='lihang',
                             db='report',
                             charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor)
# Parameterized SQL templates; pymysql escapes the %s arguments for us.
inssql = "INSERT INTO `gamble` (`url`, `title`, `detailurl`) VALUES (%s, %s, %s)"
selsql = "SELECT * FROM `gamble` WHERE `url`=%s"
s = requests.Session()
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",
    "Content-Type": "application/x-www-form-urlencoded",
    "Connection": "keep-alive",
    "DNT": "1"
}
url_360 = "https://www.so.com/s"
payload_360 = {
    'q': '房地产',  # search keyword
    'pn': 1,
    'ie': 'utf8'
}
url_baidu = "https://www.baidu.com/s"
payload_baidu = {
    'wd': '房地产',  # search keyword
    'pn': 0,
    'tn': 'monline_4_dg',
    'ie': 'utf-8'
}
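requests assembles the query string from the params dict on every call, so paging only requires incrementing pn between requests. To see the exact URL that will be sent, a quick sanity check (not part of the original script) is:

from requests import Request
# Build and prepare the request without sending it, then inspect the final URL.
print(Request('GET', url_baidu, params=payload_baidu).prepare().url)
# e.g. https://www.baidu.com/s?wd=%E6%88%BF%E5%9C%B0%E4%BA%A7&pn=0&tn=monline_4_dg&ie=utf-8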
baimingdan = {
    # Despite the name ("whitelist"), this set acts as a blocklist: results
    # whose resolved URL contains any of these domains are skipped.
    "baidu.com",
    "douban.com",
    "tianya.cn"
}
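One caveat with this filter: it is a plain substring test, so "baidu.com" would also match an unrelated host such as baidu.com.example.net. A stricter variant (a sketch of my own, not in the original script) compares the parsed hostname instead:

from urllib.parse import urlparse

def in_blocklist(url):
    # Match the exact domain or any of its subdomains.
    host = urlparse(url).netloc
    return any(host == d or host.endswith('.' + d) for d in baimingdan)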
def getbaidu():
    for i in range(100):  # number of result pages to crawl
        print(i + 1)
        r = s.get(url_baidu, params=payload_baidu, headers=headers)
        page = Pq(r.content.decode('utf-8'))
        baiduUrls = []
        for site in page('div.result.c-container h3.t a').items():
            baiduUrls.append((site.attr('href'), site.text()))
        for tmpurl in baiduUrls:
            try:
                # Baidu result links are redirects; with allow_redirects=False
                # the real destination appears in the Location response header.
                tmpPage = s.get(tmpurl[0], allow_redirects=False)
                location = tmpPage.headers.get('location')
                if not location:
                    continue
                Ehttpurl = re.match(r"https?://.*?/", location).group(0)
                if not any(bb in location for bb in baimingdan):  # skip blocklisted domains
                    with connection.cursor() as cursor:
                        cursor.execute(selsql, (Ehttpurl,))
                        if cursor.fetchone() is None:  # only insert unseen sites
                            cursor.execute(inssql, (Ehttpurl, tmpurl[1], location))
                            connection.commit()
            except Exception as e:
                print(e)
        payload_baidu["pn"] += 10  # page done; Baidu's pn steps by 10 results
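getbaidu() depends on Baidu's interstitial redirect: each result href points at a www.baidu.com/link?url=... address that answers with a 302 whose Location header is the real destination. A minimal standalone probe of that behavior (the link value is a placeholder) would look like this:

def resolve_baidu_link(link):
    # Return the Location header of Baidu's redirect response, or None.
    resp = s.get(link, allow_redirects=False)
    return resp.headers.get('Location')

# print(resolve_baidu_link('https://www.baidu.com/link?url=...'))  # placeholder link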
def get360():
    for i in range(100):  # number of result pages to crawl
        print(i + 1)
        r = s.get(url_360, params=payload_360, headers=headers)
        page = Pq(r.content.decode('utf-8'))
        soUrls = []
        for site in page('ul.result h3.res-title a').items():
            soUrls.append((site.attr('href'), site.text()))
        for tmpurl in soUrls:
            try:
                # 360 serves an interstitial page; the real destination is
                # embedded in it as URL='...'.
                tmpPage = s.get(tmpurl[0])
                detailurl = re.search(r"URL='(.*?)'", tmpPage.content.decode('utf-8'), re.S)
                httpurl = re.match(r"https?://.*?/", detailurl.group(1)).group(0)
                if not any(bb in detailurl.group(1) for bb in baimingdan):  # skip blocklisted domains
                    with connection.cursor() as cursor:
                        cursor.execute(selsql, (httpurl,))
                        if cursor.fetchone() is None:  # only insert unseen sites
                            cursor.execute(inssql, (httpurl, tmpurl[1], detailurl.group(1)))
                            connection.commit()
            except Exception as e:
                print(e)
        payload_360["pn"] += 1  # page done; 360's pn steps by 1
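Unlike Baidu, get360() has to parse the interstitial page itself, pulling the target out of a URL='...' fragment. The regex can be exercised on a canned snippet (the page format here is an assumption for illustration):

sample = "<meta http-equiv=\"refresh\" content=\"0;URL='http://example.com/'\">"
m = re.search(r"URL='(.*?)'", sample, re.S)
print(m.group(1) if m else 'no redirect found')  # -> http://example.com/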
if __name__ == '__main__':
    get360()
    getbaidu()
    connection.close()
Database table definition
CREATE TABLE `gamble` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `url` varchar(255) DEFAULT NULL,
  `detailurl` varchar(255) DEFAULT NULL,
  `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
  `lv` varchar(255) DEFAULT NULL,
  `subtime` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
  `title` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
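The script deduplicates by running a SELECT before every INSERT. An alternative worth considering (my own suggestion, not what the original does) is to add a unique key on url and use MySQL's INSERT IGNORE, which skips duplicates in a single round trip:

with connection.cursor() as cursor:
    # One-time schema change; fails if duplicate urls already exist in the table.
    cursor.execute("ALTER TABLE `gamble` ADD UNIQUE KEY `uk_url` (`url`)")
    # `url`, `title`, `detailurl` are hypothetical variables for illustration.
    cursor.execute(
        "INSERT IGNORE INTO `gamble` (`url`, `title`, `detailurl`) VALUES (%s, %s, %s)",
        (url, title, detailurl))
connection.commit()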
If you have questions, contact hudcan@sina.com
Personal site: http://ext.123cc.cc