Scraping search result titles from dogedoge

The script below queries dogedoge (https://www.dogedoge.com) for a keyword, pages through the results, extracts each result's title, URL and domain, and stores the records in a MySQL table.

# coding:utf-8
import datetime
import hashlib

import pymysql
import requests
from lxml import etree

def search_data(kw, n):
    """Fetch page n of dogedoge results for kw, parse every result block,
    save the rows to MySQL and return the "next page" nodes (empty when done)."""
    ll = []
    if n > 1:
        res = requests.get('https://www.dogedoge.com/results?q={}&p={}'.format(kw, n))
    else:
        res = requests.get('https://www.dogedoge.com/results?q={}'.format(kw))
    con = etree.HTML(res.text)
    results = con.xpath('//div[@class="result results_links_deep highlight_d result--url-above-snippet"]')
    for u in results:
        # Result title: join all text nodes under the <a> inside the <h2>.
        title = ''.join(u.xpath('./div/h2/a//text()'))
        # Displayed result URL: join the text of the URL <span>.
        url = ''.join(u.xpath('./div/div/div/a/span//text()'))
        # Take the domain from the URL, with or without a scheme prefix.
        if url.find('http') != -1:
            domain = url.split('/')[2]
        else:
            domain = url.split('/')[0]
        md5 = hashlib.md5(url.encode('utf-8')).hexdigest()
        item = {
            'keywd': kw,
            'domain': domain,
            'title': title,
            'md5': md5,
            'url': url,
            'searcher': 'dogedoge',
        }
        ll.append(item)
    save(ll)
    # xpath() returns an empty list when the "next page" container is missing,
    # so check the result directly instead of relying on an exception.
    next_page = con.xpath('//div[@id="rld-2"]')
    if not next_page:
        print('No more pages')
        return ''
    return next_page


def main(kw):
    # Request page after page until search_data reports no next page.
    n = 1
    while True:
        next_page = search_data(kw, n)
        if not next_page:
            break
        n += 1


def save(ll):
    db = pymysql.connect(
        host=MYSQL_HOST,
        db=MYSQL_DBNAME,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWD,
        charset='utf8',
        use_unicode=True)
    cursor = db.cursor()
    for item in ll:
        try:
            # Insert one result row into the database.
            cursor.execute(
                "insert into weixintb(md5,keyword,title,url,`date`,`domain`,browser) values(%s,%s,%s,%s,%s,%s,%s)",
                (item['md5'],
                 item['keywd'],
                 item['title'],
                 item['url'],
                 datetime.datetime.now(),
                 item['domain'],
                 item['searcher']
                 ))
            # Commit the statement.
            db.commit()

        except Exception as error:
            # On failure (e.g. a duplicate row), roll back and move on.
            # print(error)
            # logger.error(error)
            db.rollback()
    cursor.close()
    db.close()

if __name__ == '__main__':
    main('keyword to crawl')  # replace with the keyword you want to search
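
The script references MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER and MYSQL_PASSWD without defining them, and it expects a weixintb table with the columns named in the INSERT. Below is a minimal setup sketch; the host, credentials and column sizes are placeholder assumptions, not values from the original post.

import pymysql

# Placeholder connection settings -- substitute your own.
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'spider'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'password'

def init_db():
    # Create the weixintb table used by save(); column types and sizes are
    # guesses based only on the INSERT statement above.
    db = pymysql.connect(host=MYSQL_HOST, db=MYSQL_DBNAME, user=MYSQL_USER,
                         passwd=MYSQL_PASSWD, charset='utf8')
    cursor = db.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS weixintb (
            id INT AUTO_INCREMENT PRIMARY KEY,
            md5 CHAR(32),
            keyword VARCHAR(255),
            title VARCHAR(512),
            url VARCHAR(1024),
            `date` DATETIME,
            `domain` VARCHAR(255),
            browser VARCHAR(64)
        ) DEFAULT CHARSET=utf8
    """)
    db.commit()
    cursor.close()
    db.close()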
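
The XPath expressions are tied to dogedoge's result markup at the time the post was written and may no longer match. A quick, database-free way to check them is sketched below; the preview() helper and the User-Agent header are additions for illustration, not part of the original script.

import requests
from lxml import etree

def preview(kw, n=1):
    # Fetch one results page and print what the selectors extract.
    url = 'https://www.dogedoge.com/results?q={}'.format(kw)
    if n > 1:
        url += '&p={}'.format(n)
    res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    con = etree.HTML(res.text)
    results = con.xpath('//div[@class="result results_links_deep highlight_d result--url-above-snippet"]')
    print('{} results on page {}'.format(len(results), n))
    for r in results:
        title = ''.join(r.xpath('./div/h2/a//text()'))
        link = ''.join(r.xpath('./div/div/div/a/span//text()'))
        print(title, '->', link)

preview('python')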