Web Scraping Comprehensive Assignment
The previously saved data can be read back with pandas:
import pandas as pd

newsdf = pd.read_csv(r'F:\duym\gzccnews.csv')
1. Saving the scraped content to a SQLite3 database
import sqlite3

# Write the DataFrame into a table named gzccnews
with sqlite3.connect('gzccnewsdb.sqlite') as db:
    newsdf.to_sql('gzccnews', con=db)

# Read the table back into a new DataFrame
with sqlite3.connect('gzccnewsdb.sqlite') as db:
    df2 = pd.read_sql_query('SELECT * FROM gzccnews', con=db)
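A quick sanity check on the round trip (note that df2 gains one extra column, because to_sql saves the DataFrame index by default):

print(df2.shape)   # should match newsdf's shape, plus the saved index column
print(df2.head())  # eyeball the first few records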
Saving to a MySQL database
import pandas as pd
import pymysql
from sqlalchemy import create_engine

# Connection string: user, password, host, port and the gzccnews database
conInfo = "mysql+pymysql://user:passwd@host:port/gzccnews?charset=utf8"
engine = create_engine(conInfo, encoding='utf-8')
df = pd.DataFrame(allnews)
df.to_sql(name='news', con=engine, if_exists='append', index=False)
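Reading the table back out of MySQL works the same way as with SQLite; a minimal sketch reusing the engine above (df3 is just an illustrative name):

# Pull the news table back into a DataFrame through the same engine
df3 = pd.read_sql_query('SELECT * FROM news', con=engine)
print(len(df3), 'rows read back from the news table')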
As a sports-loving guy, I chose to scrape NBA playoff news.
Here is the code that creates the crawler: it fetches a page, detects its encoding, and parses it into a BeautifulSoup object.
import requests
import chardet
from bs4 import BeautifulSoup

def creat_bs(url):
    result = requests.get(url)
    # Set the encoding of the response to the webpage's own encoding
    e = chardet.detect(result.content)['encoding']
    result.encoding = e
    c = result.content
    soup = BeautifulSoup(c, 'lxml')
    return soup
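A quick usage check (the URL here is a placeholder, not the actual forum address used):

# Placeholder URL for illustration only
soup = creat_bs('https://bbs.example.com/nba-1.html')
print(soup.title.text if soup.title else 'no <title> found')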
A helper that builds the list of page URLs to fetch:
def build_urls(prefix, suffix):
    urls = []
    for item in suffix:
        url = prefix + item
        urls.append(url)
    return urls
The scraping function, which extracts post titles and links from a page:
import numpy as np

def find_title_link(soup):
    titles = []
    links = []
    try:
        contanier = soup.find('div', {'class': 'container_padd'})
        ajaxtable = contanier.find('form', {'id': 'ajaxtable'})
        page_list = ajaxtable.find_all('li')
        for page in page_list:
            titlelink = page.find('a', {'class': 'truetit'})
            # Some titles are wrapped in a <b> tag rather than plain text
            if not titlelink.text:
                title = titlelink.find('b').text
            else:
                title = titlelink.text
            # Keep a random sample of roughly 10% of the posts
            if np.random.uniform(0, 1) > 0.90:
                link = titlelink.get('href')
                titles.append(title)
                links.append(link)
    except AttributeError:
        print('have no value')
    return titles, links
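A hypothetical driver tying the three functions together (the board URL and page suffixes are placeholders; reply_group, used below, would come from a similar pass over each thread's replies, which the post doesn't show):

# Placeholder prefix/suffixes; the real board URL is not given in the post
prefix = 'https://bbs.example.com/nba-'
suffixes = [str(i) + '.html' for i in range(1, 6)]

title_group = []
link_group = []
for url in build_urls(prefix, suffixes):
    soup = creat_bs(url)
    titles, links = find_title_link(soup)  # ~10% random sample per page
    title_group.extend(titles)
    link_group.extend(links)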
Saving the data:
wordlist = str()
for title in title_group:
    wordlist += title
for reply in reply_group:
    wordlist += reply

def savetxt(wordlist):
    # Write the concatenated text out as UTF-8 bytes
    f = open('wordlist.txt', 'wb')
    f.write(wordlist.encode('utf8'))
    f.close()

savetxt(wordlist)
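The post doesn't show the word-cloud code itself; a minimal sketch of how wordlist.txt could be turned into one, assuming the jieba and wordcloud packages (the font path is a placeholder and must point to a font with Chinese glyphs):

import jieba
from wordcloud import WordCloud

# Segment the Chinese text into space-separated words for WordCloud
with open('wordlist.txt', encoding='utf8') as f:
    text = ' '.join(jieba.lcut(f.read()))

# font_path is a placeholder; rendering Chinese text requires a Chinese font
wc = WordCloud(font_path='simhei.ttf', width=800, height=600, background_color='white')
wc.generate(text)
wc.to_file('wordcloud.png')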
The generated word cloud (image not reproduced in this text version).