Python 爬虫实例(2)—— 爬取今日头条
# coding:utf-8 import base64 import random, re import sqlite3 import redis, pickle import json, time import urllib3,urllib2,hashlib from datetime import datetime import threading import logging.handlers import sys reload(sys) sys.setdefaultencoding('utf-8') import uuid import requests session = requests.session() #把连接加密成 MD5 生成唯一的主键 def md5(str): import hashlib m = hashlib.md5() m.update(str) return m.hexdigest() def jinri(): list_data = [] for i in range(1,20): #请求得到url 链接 url = "http://www.toutiao.com/api/pc/feed/" data = { "category":"news_game", "utm_source":"toutiao", "widen":str(i), "max_behot_time":"0", "max_behot_time_tmp":"0", "tadrequire":"true", "as":"479BB4B7254C150", "cp":"7E0AC8874BB0985", } headers = { "Host":"www.toutiao.com", "Connection":"keep-alive", "Accept":"text/javascript, text/html, application/xml, text/xml, */*", "X-Requested-With":"XMLHttpRequest", "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36", "Content-Type":"application/x-www-form-urlencoded", "Referer":"http://www.toutiao.com/ch/news_hot/", "Accept-Encoding":"gzip, deflate", "Accept-Language":"zh-CN,zh;q=0.8", } result1 = session.get(url=url,params=data,headers=headers).text result2 =json.loads(result1) if result2["message1"] =="success": for i in result2["data"]: source_url =i["source_url"] headers = { "Host":"www.toutiao.com", "Connection":"keep-alive", "Cache-Control":"max-age=0", "Upgrade-Insecure-Requests":"1", "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", "Accept-Encoding":"gzip, deflate", "Accept-Language":"zh-CN,zh;q=0.8", } url1 = "http://www.toutiao.com" + str(source_url) try: return_data = session.get(url=url1, headers=headers).content except: pass # print return_data try: contentData = re.findall(' 
<article>(.*?)</article>',return_data)[0] except: contentData = "" cx = sqlite3.connect("C:\\Users\\xuchunlin\\PycharmProjects\\study\\db.sqlite3",check_same_thread=False) cx.text_factory = str try: print "正在插入链接 %s 数据" % (url) chinese_ta = i["chinese_tag"] media_avatar_url = i["media_avatar_url"] is_feed_ad = i["is_feed_ad"] tag_url = i["tag_url"] title = i["title"] tag = i["tag"] label = str(i["label"]) abstract = i["abstract"] source_url = i["source_url"] print title print chinese_ta print media_avatar_url print is_feed_ad print tag_url print tag print label print abstract print source_url url2 = md5(str(url1)) cx.execute("INSERT INTO toutiao (title,chinese_ta,media_avatar_url,is_feed_ad,tag_url,tag,label,abstract,source_url,url,contentData)VALUES (?,?,?,?,?,?,?,?,?,?,?)", (str(title), str(chinese_ta), str(media_avatar_url), str(is_feed_ad), str(tag_url), str(tag), str(label), str(abstract), str(source_url), str(url2),str(contentData))) cx.commit() # time.sleep(2) except Exception as e: print e print "cha ru shi bai " cx.close() else: print "请求失败" return list_data print jinri()
爬虫本身很简单,难的是自己去分析网页、解析网页,以及提升爬虫的效率。
如果觉得对您有帮助,麻烦您点一下推荐,谢谢!
好记忆不如烂笔头
好记忆不如烂笔头
分类:
Python 爬虫实例
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 地球OL攻略 —— 某应届生求职总结
· 周边上新:园子的第一款马克杯温暖上架
· Open-Sora 2.0 重磅开源!
· 提示词工程——AI应用必不可少的技术
2016-06-30 psycopg2.pool – Connections pooling / psycopg2.pool – 连接池 / postgresql 连接池
2016-06-30 用xshell操作linux系统的常用命令