Web Crawler Final Assignment
import re

import demjson
import jieba
import requests
from bs4 import BeautifulSoup


def get_content_info(content_url):
    # Fetch one article page and return the body text of its #endText node.
    res = requests.get(content_url)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')
    return soup.select('#endText')[0].text


def get_page_info(page_url):
    # Fetch one list page; the article index is embedded in a <script> tag
    # as a JavaScript literal: "var data_list = [...]".
    res = requests.get(page_url)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')
    json_str = soup.select('script')[4].text.replace('var data_list =', '')
    data = demjson.decode(json_str)
    # Concatenate the text of every article on the page (the original
    # returned inside the loop, so only the first article was fetched).
    return ''.join(get_content_info(i['url']) for i in data)


def print_words_count(text, top):
    # Stop words and stray symbols to strip before counting.
    miss_word = "了|他|说|我|你|就|着|又|的|在|是|有|把|到|也|不|都|她|这|便|去|们|还|但|一个|和|却|里|来|要|没|很|\"" \
                "|那|么|一|将|呢|起|于|上|只|得|而|而且|对|所以|见|些|才|从|过|被|并|时|且|给|道|虽然|可以|出|;|="
    text = re.sub(r"[\s+\.\!\/_\",$%^*+—()?【】“《;》”!\-:,。?、~@#¥%……&*()]+", "", text)
    # Group the alternation so '+' repeats the whole stop-word pattern,
    # not just its last alternative.
    text = re.sub('(?:' + miss_word + ')+', "", text)
    words = list(jieba.cut(text))
    key_words = {}
    for i in set(words):  # count the frequency of each word
        key_words[i] = words.count(i)
    sort_word = sorted(key_words.items(), key=lambda d: d[1], reverse=True)  # sort by frequency
    for j in range(top):  # print the top N entries
        print(sort_word[j])


cn = ''
url = 'http://renjian.163.com/special/renjian_jishi/'
res = requests.get(url)
res.encoding = 'gbk'
soup = BeautifulSoup(res.text, 'html.parser')
# The second-to-last pager link holds the total number of pages.
page = int(soup.select('.list_page')[0].select('a')[-2].text)
for p in range(2, page + 1):  # range(2, page) would skip the last page
    # Page numbers below 10 are zero-padded in the URL (e.g. _02 vs _12).
    cn += get_page_info('http://renjian.163.com/special/renjian_jishi_{:02d}/'.format(p))
print_words_count(cn, 120)
The crawler scrapes the articles in the Jishi (记事) column of NetEase's Renjian (人间) channel.
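The code reaches for demjson rather than the standard json module, presumably because the embedded "var data_list = [...]" is a JavaScript literal rather than strict JSON. A minimal sketch of that difference, assuming JavaScript-style syntax (unquoted keys, single-quoted strings); the URL and title values here are hypothetical, only the presence of a 'url' key is implied by the code above:

    import demjson

    # Strict json.loads would reject this literal; demjson's non-strict
    # decoder accepts unquoted keys and single quotes.
    js_literal = "[{url: 'http://renjian.163.com/article/1.html', title: '记事'}]"
    data = demjson.decode(js_literal)
    print(data[0]['url'])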
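One design note on the counting step: the loop above calls words.count(i) once per distinct word, rescanning the whole list each time, which is quadratic in the corpus size. A minimal sketch of the same top-N tally done in one pass with collections.Counter; the sample sentence is hypothetical input, not data from the site, and no stop-word filtering is applied here:

    from collections import Counter

    import jieba

    def top_words(text, top):
        # Counter tallies the segmented words in a single pass,
        # and most_common() replaces the manual sort.
        words = Counter(jieba.cut(text))
        for word, freq in words.most_common(top):
            print(word, freq)

    top_words('人间记事栏目的文章很多，文章值得一读。', 5)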