爬取dogedoge搜索引擎结果标题
# coding:utf-8
"""Crawl dogedoge.com search results for a keyword and store them in MySQL.

For each result page the title, displayed URL and domain of every hit are
extracted and inserted into the ``weixintb`` table, one row per hit.
"""
import contextlib
import datetime
import hashlib
import logging
import sys

import lxml
import pymysql
import requests
from lxml import etree

if sys.version_info[0] == 2:
    # Python 2 only: make UTF-8 the implicit str/unicode conversion codec so
    # that mixing byte strings with unicode text does not raise.
    reload(sys)  # noqa: F821 -- `reload` is a builtin on Python 2
    sys.setdefaultencoding('utf-8')

logger = logging.getLogger(__name__)

# NOTE(review): MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER and MYSQL_PASSWD are used
# by save() but are not defined in the visible source; they must be provided
# (e.g. by a settings module) before save() is called.

SEARCH_URL = 'https://www.dogedoge.com/results'
REQUEST_TIMEOUT = 10  # seconds; without it a stalled server blocks forever
RESULT_XPATH = (
    '//div[@class="result results_links_deep highlight_d '
    'result--url-above-snippet"]'
)
NEXT_PAGE_XPATH = '//div[@id="rld-2"]'
INSERT_SQL = (
    "insert into weixintb(md5,keyword,title,url,`date`,`domain`, browser) "
    "value(%s, %s, %s, %s, %s, %s,%s)"
)


def _extract_domain(url):
    """Return the host part of *url*, whether or not it carries a scheme."""
    # Only strip a real "scheme://" prefix; a bare substring test for "http"
    # misfires on hosts/paths that merely contain those letters.
    if '://' in url:
        url = url.split('://', 1)[1]
    return url.split('/')[0]


def search_data(kw, n):
    """Fetch result page *n* for keyword *kw*, save its hits, report paging.

    Args:
        kw: search keyword.
        n: 1-based page number; the ``p`` query parameter is only sent for
            pages after the first.

    Returns:
        The list of nodes matching the next-page marker used by this script
        (``//div[@id="rld-2"]``). The list is empty, hence falsy, when there
        is no further page.
    """
    params = {'q': kw}
    if n > 1:
        params['p'] = n
    # `params=` lets requests URL-encode the keyword, and a single request is
    # issued per page.
    res = requests.get(SEARCH_URL, params=params, timeout=REQUEST_TIMEOUT)

    con = etree.HTML(res.text)
    if con is None:
        # Defensive: nothing parseable came back, so there is nothing to
        # save and no next page to follow.
        print('没有下一页了')
        return []

    items = []
    for node in con.xpath(RESULT_XPATH):
        title = ''.join(node.xpath('./div/h2/a//text()'))
        url = ''.join(node.xpath('./div/div/div/a/span//text()'))
        items.append({
            'keywd': kw,
            'domain': _extract_domain(url),
            'title': title,
            # md5 needs bytes; encode explicitly so this works on Python 3.
            'md5': hashlib.md5(url.encode('utf-8')).hexdigest(),
            'url': url,
            'searcher': 'dogedoge',
        })
    save(items)

    # xpath() returns an empty list rather than raising when nothing matches,
    # so test the result instead of wrapping the call in try/except.
    next_nodes = con.xpath(NEXT_PAGE_XPATH)
    if not next_nodes:
        print('没有下一页了')
    return next_nodes


def main(kw, max_pages=None):
    """Crawl result pages for *kw* until no next page is reported.

    Args:
        kw: search keyword.
        max_pages: optional upper bound on the number of pages fetched;
            ``None`` (the default) means no limit.
    """
    n = 1
    while max_pages is None or n <= max_pages:
        if not search_data(kw, n):
            break
        n += 1


def save(ll):
    """Insert every item of *ll* into the ``weixintb`` MySQL table.

    Each row is committed on its own; a row that fails is logged and rolled
    back, and the remaining rows are still attempted.

    Args:
        ll: list of dicts with the keys ``md5``, ``keywd``, ``title``,
            ``url``, ``domain`` and ``searcher``.
    """
    if not ll:
        # Nothing to store: do not open a connection at all.
        return

    db = pymysql.connect(
        host=MYSQL_HOST,
        db=MYSQL_DBNAME,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWD,
        charset='utf8',
        use_unicode=True)
    # closing() guarantees cursor and connection are released even when an
    # unexpected error escapes the per-row handler below.
    with contextlib.closing(db), contextlib.closing(db.cursor()) as cursor:
        for item in ll:
            try:
                cursor.execute(
                    INSERT_SQL,
                    (item['md5'], item['keywd'], item['title'], item['url'],
                     datetime.datetime.now(), item['domain'],
                     item['searcher']))
                db.commit()
            except Exception:
                # Best effort per row: record the failure instead of hiding
                # it, undo the partial statement and continue.
                logger.exception('failed to insert row')
                db.rollback()


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main('爬取关键词')
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· DeepSeek 开源周回顾「GitHub 热点速览」
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了