python爬虫练习--爬取所有微博
Ajax,全称为Asynchronous JavaScript and XML,即异步的JavaScript和XML。它不是一门编程语言,而是利用JavaScript在保证页面不被刷新、页面链接不改变的情况下与服务器交换数据并更新部分网页的技术。
对于传统的网页,如果想更新其内容,那么必须要刷新整个页面,但有了Ajax,便可以在页面不被全部刷新的情况下更新其内容。在这个过程中,页面实际上是在后台与服务器进行了数据交互,获取到数据之后,再利用JavaScript改变网页,这样网页内容就会更新了。
可以到W3School上体验几个示例来感受一下:http://www.w3school.com.cn/ajax/ajax_xmlhttprequest_send.asp
代码如下:
#!/usr/bin/env python
# coding: utf-8
"""Scrape all posts of one Weibo user through the m.weibo.cn Ajax API.

Pages are fetched one at a time from the mobile container endpoint,
parsed into plain dicts, printed, and stored in MySQL (a MongoDB sink
is also provided but not wired into ``__main__``, matching the
original script's behavior).
"""

import time

import requests
from pymongo import MongoClient
from pyquery import PyQuery as pq
import pymysql

# Mobile-site headers; X-Requested-With marks the request as Ajax so the
# server returns JSON instead of a rendered page.
headers = {
    'Host': 'm.weibo.cn',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://m.weibo.cn/u/2830678474'
}


def get_page(page):
    """Fetch one page of the user's post list.

    :param page: 1-based page number passed to the container API.
    :return: decoded JSON dict on HTTP 200, otherwise ``None``.
    """
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page,
    }
    url = 'https://m.weibo.cn/api/container/getIndex'
    try:
        # timeout keeps a stalled server from hanging the whole crawl
        res = requests.get(url, headers=headers, params=params, timeout=10)
        if res.status_code == 200:
            return res.json()
    except requests.ConnectionError as e:
        print('Error', e.args)
    return None  # explicit: non-200 or connection failure yields None


def parse_page(payload, page):
    """Yield one dict per post card found in *payload*.

    The second card on page 1 is skipped (it is not a post in the
    original data layout).  Parameter renamed from ``json`` to avoid
    shadowing the stdlib module of the same name.

    :param payload: JSON dict returned by :func:`get_page`, or ``None``.
    :param page: page number, used only for the page-1 skip rule.
    """
    if not payload:
        return
    # .get with defaults: a response missing 'data' must not raise
    cards = payload.get('data', {}).get('cards') or []
    for index, card in enumerate(cards):
        if page == 1 and index == 1:
            continue
        mblog = card.get('mblog')
        if not mblog:  # some cards carry no post body; skip safely
            continue
        yield {
            'id': mblog.get('id'),
            # pyquery strips the HTML markup embedded in the text field
            'text': pq(mblog.get('text')).text(),
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count'),
        }


def save_mysql(result):
    """Insert one post dict into the MySQL ``weibo`` table.

    Column names come from the dict keys; values are bound as query
    parameters, never interpolated into the SQL string.
    """
    table = 'weibo'
    keys = ', '.join(result.keys())
    values = ', '.join(['%s'] * len(result))
    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(
        table=table, keys=keys, values=values)
    db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                         port=3306, db='spiders', charset="utf8")
    try:
        cursor = db.cursor()
        if cursor.execute(sql, tuple(result.values())):
            print('Successful')
        db.commit()
    except Exception as e:
        print('Failed', e.args)
        db.rollback()
    finally:
        db.close()  # original leaked the connection when execute raised


def save_to_mongo(result):
    """Insert one post dict into MongoDB (database/collection ``weibo``)."""
    client = MongoClient()
    collection = client['weibo']['weibo']
    # Collection.insert() was removed in PyMongo 4; insert_one() is the
    # supported replacement and returns a truthy InsertOneResult.
    if collection.insert_one(result):
        print('Saved to Mongo')


if __name__ == '__main__':
    for page in range(1, 11):
        data = get_page(page)
        for result in parse_page(data, page):
            print(result)
            save_mysql(result)
        time.sleep(1)  # throttle requests to stay under rate limits