0426
import re
import jieba
import pandas as pd
import numpy
import matplotlib.pyplot as plt
from wxpy import *

# Initialize the bot; log in by scanning the QR code
bot = Bot()

# Get the friend list
my_friends = bot.friends()
print(type(my_friends))

# Count male and female friends with a dict
sex_dict = {'male': 0, 'female': 0}
for friend in my_friends:
    if friend.sex == 1:
        sex_dict['male'] += 1
    elif friend.sex == 2:
        sex_dict['female'] += 1
print(sex_dict)

# Count friends per province with a dict
province_dict = {'北京': 0, '上海': 0, '天津': 0, '重庆': 0, '河北': 0, '山西': 0, '吉林': 0,
                 '辽宁': 0, '黑龙江': 0, '陕西': 0, '甘肃': 0, '青海': 0, '山东': 0, '福建': 0,
                 '浙江': 0, '台湾': 0, '河南': 0, '湖北': 0, '湖南': 0, '江西': 0, '江苏': 0,
                 '安徽': 0, '广东': 0, '海南': 0, '四川': 0, '贵州': 0, '云南': 0, '内蒙古': 0,
                 '新疆': 0, '宁夏': 0, '广西': 0, '西藏': 0, '香港': 0, '澳门': 0}

for friend in my_friends:
    if friend.province in province_dict.keys():
        province_dict[friend.province] += 1

# Build JSON-Array-style records to make the data easier to present
data = []
for key, value in province_dict.items():
    data.append({'name': key, 'value': value})
print(data)

def write_txt_file(path, txt):
    '''Append text to a txt file'''
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f.write(txt)

# Collect signatures, keeping only Chinese characters so that punctuation
# and other noise do not distort the word-frequency statistics
for friend in my_friends:
    pattern = re.compile(r'[一-龥]+')
    filterdata = re.findall(pattern, friend.signature)
    write_txt_file('signatures.txt', ''.join(filterdata))

def read_txt_file(path):
    '''Read a txt file'''
    with open(path, 'r', encoding='gb18030', newline='') as f:
        return f.read()

# Segment the collected signatures with jieba
txt_filename = 'signatures.txt'
content = read_txt_file(txt_filename)
segment = jieba.lcut(content)
words_df = pd.DataFrame({'segment': segment})

# Remove stop words
stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep=" ",
                        names=['stopword'], encoding='utf-8')
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

# Word-frequency statistics; agg({"计数": numpy.size}) is no longer supported
# in current pandas, so count with groupby().size() instead
words_stat = words_df.groupby('segment').size().reset_index(name='计数')
words_stat = words_stat.sort_values(by=['计数'], ascending=False)

from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

# Word-cloud settings; scipy.misc.imread has been removed from SciPy,
# so read the mask image with PIL instead
color_mask = numpy.array(Image.open('background.jfif'))
wordcloud = WordCloud(font_path="simhei.ttf",     # font able to render Chinese
                      background_color="white",   # background color
                      max_words=100,              # maximum number of words shown
                      mask=color_mask,            # background image used as the mask
                      max_font_size=100,          # maximum font size
                      random_state=42,
                      width=1000, height=860, margin=2
                      # width/height set the default canvas size, but when a mask
                      # image is used the output follows the mask size;
                      # margin is the spacing around each word
                      )

# Either feed the raw text to generate(), or compute the frequencies
# ourselves and call generate_from_frequencies()
word_frequence = {x[0]: x[1] for x in words_stat.head(100).values}
print(word_frequence)
wordcloud.generate_from_frequencies(word_frequence)

# Take colors from the background image and recolor the cloud
image_colors = ImageColorGenerator(color_mask)
wordcloud.recolor(color_func=image_colors)

# Save and display the result
wordcloud.to_file('output.png')
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
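The `data` list above is built "for easier presentation" but is only printed in this snippet. A minimal sketch of handing it to a chart or web page by dumping it as JSON; the file name province_data.json is a placeholder, not part of the original code:

import json

# Placeholder output path; any file a chart page can load will do
with open('province_data.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Chinese province names readable in the file
    json.dump(data, f, ensure_ascii=False, indent=2)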
import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import random
import pandas as pd

def clickCounts(url):
    # Read the click count from the counter API, using the article id taken from the URL
    id = re.findall(r'\d+', url)[-1]
    clickUrl = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(id)
    clickStruct = requests.get(clickUrl).text
    clickCounts = int(clickStruct.split('.html')[-1][2:-3])
    return clickCounts

def newsdt(showInfo):
    # Parse the publication datetime out of the "show-info" text
    newsDate = showInfo.split()[0].split(':')[1]
    newsTime = showInfo.split()[1]
    newsDT = newsDate + ' ' + newsTime
    dt = datetime.strptime(newsDT, '%Y-%m-%d %H:%M:%S')
    return dt

def anews(url):
    # Scrape a single article: title, publication datetime and click count
    newsDetail = {}
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsDetail['newsTitle'] = soup.select('.show-title')[0].text
    showInfo = soup.select('.show-info')[0].text
    newsDetail['newsDT'] = newsdt(showInfo)
    newsDetail['newsClick'] = clickCounts(url)
    return newsDetail

newsUrl = "http://news.gzcc.cn/html/2019/xiaoyuanxinwen_0404/11155.html"
print(anews(newsUrl))

def alist(listUrl):
    # Scrape every article linked from one listing page
    res = requests.get(listUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            newsUrl = news.select('a')[0]['href']
            newsDesc = news.select('.news-list-description')[0].text
            newsDict = anews(newsUrl)
            newsDict['newsUrl'] = newsUrl
            newsDict['description'] = newsDesc
            newsList.append(newsDict)
    return newsList

# First listing page
listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
allnews = alist(listUrl)
for newtro in allnews:
    print(newtro)

# Listing pages 2-11
allnews = []
for i in range(2, 12):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allnews.extend(alist(listUrl))
for n in allnews:
    print(n)

# Total number of news items scraped
print(len(allnews))

# Collect the results into a DataFrame for storage
newsdf = pd.DataFrame(allnews)
import sqlite3

# Save the scraped news to SQLite, then read it back to verify
with sqlite3.connect('gzccnewsdb.sqlite') as db:
    newsdf.to_sql('gzccnews', con=db, if_exists='replace')
with sqlite3.connect('gzccnewsdb.sqlite') as db:
    df2 = pd.read_sql_query('SELECT * FROM gzccnews', con=db)
df2

# Packages needed to store the same data in MySQL
!pip install PyMySQL
!pip install sqlalchemy
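The two pip installs point at writing the same DataFrame into MySQL through SQLAlchemy and PyMySQL. A minimal sketch of that step, assuming a local MySQL server and a database named gzccnews; the connection string, user and password are placeholders, not part of the original code:

import pandas as pd
from sqlalchemy import create_engine

# Placeholder connection string: adapt user, password, host and database
# name to the local MySQL setup
conn = create_engine('mysql+pymysql://root:password@localhost:3306/gzccnews?charset=utf8mb4')

# Write the scraped news into MySQL and read it back to verify
newsdf.to_sql('gzccnews', con=conn, if_exists='replace', index=False)
df3 = pd.read_sql_query('SELECT * FROM gzccnews', con=conn)
print(df3.head())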