使用pyquery解析知乎发现【CSS选择器】
使用pyquery解析
# Scrape the Zhihu "explore" page with pyquery (CSS selectors) and append one
# JSON object per feed item to explore.json: question title, author line,
# answer text, and the text of the author's bio element.
import json

import requests
from pyquery import PyQuery as pq

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

# A timeout keeps the script from hanging forever on a stalled connection.
html = requests.get(url, headers=headers, timeout=10).text
doc = pq(html)

lines = []
for item in doc('.explore-tab .feed-item').items():
    question = item.find('h2').text()
    author = item.find('.author-link-line').text()

    # The inner HTML of the .content node is parsed a second time so that
    # .text() yields plain text without tags (presumably the answer markup is
    # stored escaped inside that node -- confirm against the live page).
    # A missing or blank fragment cannot be parsed, so fall back to ''.
    content_html = item.find('.content').html()
    if content_html and content_html.strip():
        answer = pq(content_html).text()
    else:
        answer = ''

    q = item.find('.bio').text()

    explore = {
        "question": question,
        "author": author,
        "answer": answer,
        "q": q,
    }
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    lines.append(json.dumps(explore, ensure_ascii=False) + "\n")

# Open the output once (not once per item) and pin the encoding: the records
# contain raw non-ASCII text, which the platform default codec may not be
# able to encode. Nothing is created when no items were found.
if lines:
    with open("explore.json", "a", encoding="utf-8") as f:
        f.writelines(lines)
attr()方法可获取属性
# Scrape the Zhihu "explore" page with pyquery (CSS selectors) and append one
# JSON object per feed item to explore.json. This variant reads the author's
# bio from the `title` attribute of the .bio element via attr().
import json

import requests
from pyquery import PyQuery as pq

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

# A timeout keeps the script from hanging forever on a stalled connection.
html = requests.get(url, headers=headers, timeout=10).text
doc = pq(html)

lines = []
for item in doc('.explore-tab .feed-item').items():
    question = item.find('h2').text()

    # Calling the PyQuery object with a selector searches within `item`;
    # it is used here in place of item.find('.author-link-line').
    author = item('.author-link-line').text()

    # The inner HTML of the .content node is parsed a second time so that
    # .text() yields plain text without tags (presumably the answer markup is
    # stored escaped inside that node -- confirm against the live page).
    # A missing or blank fragment cannot be parsed, so fall back to ''.
    content_html = item.find('.content').html()
    if content_html and content_html.strip():
        answer = pq(content_html).text()
    else:
        answer = ''

    # attr() returns the attribute value instead of the element's text; it is
    # None when the element or attribute is absent (serialized as JSON null).
    q = item.find('.bio').attr('title')

    explore = {
        "question": question,
        "author": author,
        "answer": answer,
        "q": q,
    }
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    lines.append(json.dumps(explore, ensure_ascii=False) + "\n")

# Open the output once (not once per item) and pin the encoding: the records
# contain raw non-ASCII text, which the platform default codec may not be
# able to encode. Nothing is created when no items were found.
if lines:
    with open("explore.json", "a", encoding="utf-8") as f:
        f.writelines(lines)