爬虫大作业
1.选一个自己感兴趣的主题。
2.用 Python 编写爬虫程序,从网络上爬取相关主题的数据。
3.对爬取的数据进行文本分析,生成词云。
4.对文本分析结果进行解释说明。
5.写一篇完整的博客,描述上述实现过程、遇到的问题及解决办法、数据分析思想及结论。
6.最后提交爬取的全部数据、爬虫及数据分析源代码。
"""Scrape the hongxiu.com book listing and save each book's details to a text file.

For every listing page the script downloads the HTML, extracts one record per
book, prints the records to stdout and appends them to a text file.
"""
from bs4 import BeautifulSoup
import requests

# Listing URL; the page number is appended directly after ``pageNum=``.
LIST_URL = ('https://www.hongxiu.com/all?pageSize=10&gender=2&catId=-1'
            '&isFinish=-1&isVip=-1&size=-1&updT=-1&orderBy=0&pageNum=')
SITE_ROOT = 'https://www.hongxiu.com'
FIRST_PAGE = 1
LAST_PAGE = 10
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server blocks forever

# Single source of truth for the record layout used by both stdout and the file.
BOOK_TEMPLATE = ('书名:{0}\n作者:{1}\n类型:{2}\n状态:{3}\n字数:{4}\n描述:{5}\n\n'
                 '书本图片:{6}\n书本网址:{7}\n')
SEPARATOR = '-*' * 100

# Books of the page currently being processed. Kept at module level so that
# ``save()`` called without arguments still has something to write.
hongxiulist = []


class Info(object):
    """Plain record holding the fields scraped for one book."""

    def __init__(self, title, author, style, state, wordcount, abstract,
                 imgurl, bookurl):
        self.title = title
        self.author = author
        self.style = style
        self.state = state
        self.wordcount = wordcount
        self.abstract = abstract
        self.imgurl = imgurl
        self.bookurl = bookurl


def describe(book):
    """Return the multi-line text description of ``book`` (an ``Info``)."""
    return BOOK_TEMPLATE.format(book.title, book.author, book.style,
                                book.state, book.wordcount, book.abstract,
                                book.imgurl, book.bookurl)


def save(books=None, file_name='新浪'):
    """Append ``books`` to ``<file_name>.txt``, numbering them from 1.

    Args:
        books: iterable of ``Info`` objects; defaults to the module-level
            ``hongxiulist`` so a bare ``save()`` call keeps working.
        file_name: output file name without the ``.txt`` extension.
    """
    if books is None:
        books = hongxiulist
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # the Chinese text and would raise UnicodeEncodeError.
    with open(file_name + '.txt', 'a', encoding='utf-8') as file:
        for num, book in enumerate(books, 1):
            file.write('\n')
            file.write('#' + str(num) + '. ' + book.title)
            file.write('\n')
            file.write(describe(book))
            file.write(SEPARATOR)
            file.write('\n')


def _text(parent, css_class):
    """Return the stripped text of the first tag with ``css_class``, or ''.

    ``get_text`` is used instead of ``.string`` because ``.string`` is None
    whenever the tag contains more than one child node.
    """
    tag = parent.find(class_=css_class)
    return tag.get_text(strip=True) if tag is not None else ''


def parse_page(html):
    """Extract a list of ``Info`` objects from one listing page's HTML.

    Returns an empty list when the book container is absent; list items that
    lack the expected info block or link are skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find('div', class_='right-book-list')
    if container is None:
        return []

    books = []
    for item in container.find_all('li'):
        info = item.find('div', class_='book-info')
        link = info.find('a') if info is not None else None
        if link is None:
            continue

        img_box = item.find('div', class_='book-img')
        img = img_box.find('img') if img_box is not None else None
        src = img.get('src', '').strip() if img is not None else ''
        # The original code prefixes 'https:' to the src value, so the same
        # is done here; an absent image yields an empty URL.
        imgurl = 'https:' + src if src else ''

        books.append(Info(
            title=link.text,
            author=_text(info, 'default'),
            style=_text(info, 'org'),
            state=_text(info, 'pink'),
            wordcount=_text(info, 'blue'),
            abstract=_text(info, 'intro'),
            imgurl=imgurl,
            bookurl=SITE_ROOT + link.get('href', '').strip(),
        ))
    return books


def fetch_page(page_num, timeout=REQUEST_TIMEOUT):
    """Download listing page ``page_num`` and return its HTML as text.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-success HTTP status.
    """
    res = requests.get(LIST_URL + str(page_num), timeout=timeout)
    res.raise_for_status()  # do not parse an error page as if it were data
    res.encoding = 'utf-8'
    return res.text


def main():
    """Fetch each listing page once, then print and save its books."""
    global hongxiulist
    for page_num in range(FIRST_PAGE, LAST_PAGE + 1):
        hongxiulist = parse_page(fetch_page(page_num))
        for book in hongxiulist:
            print(SEPARATOR)
            print(describe(book))
        # The file is opened in append mode, so each page is added to the
        # output of the previous ones.
        save(hongxiulist)


if __name__ == '__main__':
    main()