关于Alzheimer disease论文的统计
1.获取2016年的所有关键字,保存到keyword_2016.json中
import json


def output_to_json(data, filename):
    """Write *data* to *filename* as JSON and return the JSON text.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to create or overwrite.

    Returns:
        The JSON string that was written to the file.
    """
    payload = json.dumps(data)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, 'w') as handle:
        handle.write(payload)
    return payload


def build_keyword_payload(pub_year, row_count, rows):
    """Assemble the document that is stored in the keyword JSON file.

    Args:
        pub_year: Publication year label stored under ``'pub_year'``.
        row_count: Number of rows reported by the query.
        rows: Sequence of ``(union_kwd_str, pmc_id)`` tuples.

    Returns:
        A dict with the keys ``'pub_year'``, ``'count'`` and ``'keyword'``.
    """
    return {
        'pub_year': pub_year,
        'count': row_count,
        # Only the keyword string (first column) is exported.
        'keyword': [row[0] for row in rows],
    }


def fetch_keyword_rows(pub_year='2016'):
    """Query the ``alzheimer`` table for non-empty keyword strings of one year.

    Args:
        pub_year: Value compared against the ``pub_year`` column.

    Returns:
        A ``(row_count, rows)`` tuple where ``row_count`` is the value
        returned by ``cursor.execute`` and ``rows`` holds the fetched
        ``(union_kwd_str, pmc_id)`` tuples.
    """
    # Imported here so the pure helpers above stay importable on machines
    # that do not have the MySQL driver installed.
    import pymysql

    conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python',
    )
    try:
        cursor = conn.cursor()
        try:
            # ``AND`` replaces the MySQL-only ``&&`` spelling; the year is
            # passed as a bound parameter instead of being inlined.
            sql = (
                "SELECT union_kwd_str,pmc_id FROM alzheimer "
                "where pub_year = %s AND union_kwd_str != '' "
            )
            row_count = cursor.execute(sql, (pub_year,))
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()
    return row_count, rows


def main(pub_year='2016'):
    """Export all keyword strings of *pub_year* to ``keyword_<year>.json``."""
    row_count, rows = fetch_keyword_rows(pub_year)
    print(row_count)
    output_data = build_keyword_payload(pub_year, row_count, rows)
    output_to_json(output_data, 'keyword_%s.json' % pub_year)


if __name__ == '__main__':
    main()
从keyword_2016.json中读取关键词,统计词频并选出排名前25的关键词,保存到sort_keyword_2016.json中
import re
import collections
import json


def input_from_json(filename):
    """Load and return the JSON document stored in *filename*."""
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, 'r') as handle:
        return json.load(handle)


def count_word(path):
    """Count how often each keyword occurs in the JSON file at *path*.

    The file must contain a ``'keyword'`` list whose items are
    comma-separated keyword strings. Keywords are counted exactly as they
    appear between the commas (no trimming or case folding).

    Args:
        path: Path of the JSON file to read.

    Returns:
        A dict mapping each keyword to its number of occurrences.
    """
    keyword_strings = input_from_json(path)['keyword']
    counts = collections.Counter(
        word
        for keyword_string in keyword_strings
        for word in keyword_string.split(',')
    )
    return dict(counts)


def sort_by_count(d):
    """Return an ``OrderedDict`` of *d* sorted by descending count.

    The sort is stable, so entries with equal counts keep the relative
    order in which ``d.items()`` yields them.
    """
    return collections.OrderedDict(
        sorted(d.items(), key=lambda item: -item[1])
    )


def format_top_entries(sorted_counts, limit=25):
    """Render the first *limit* entries as comma-separated JSON objects.

    Underscores in the keyword are replaced by spaces for display. The
    result is a comma-joined fragment without enclosing brackets, and it
    never ends with a trailing comma, even when fewer than *limit*
    entries exist.

    Args:
        sorted_counts: Mapping of keyword to count, already ordered.
        limit: Maximum number of entries to render.

    Returns:
        A string such as ``{"name": "a b", "value": 3},{...}``.
    """
    top_items = list(sorted_counts.items())[:limit]
    return ','.join(
        json.dumps({'name': re.sub("_", " ", key), 'value': value})
        for key, value in top_items
    )


if __name__ == '__main__':
    file_name = "keyword_2016.json"
    dword = sort_by_count(count_word(file_name))
    # ``with`` guarantees the output is flushed and the handle released.
    with open('sort_keyword_2016.json', 'w') as fobj2:
        fobj2.write(format_top_entries(dword))
2.获取发表论文量排名前十的国家
1)把所有第一作者的信息保存到authorinfor.json中
import json


def output_to_json(data, filename):
    """Write *data* to *filename* as JSON and return the JSON text.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to create or overwrite.

    Returns:
        The JSON string that was written to the file.
    """
    payload = json.dumps(data)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, 'w') as handle:
        handle.write(payload)
    return payload


def build_author_payload(row_count, rows):
    """Assemble the document that is stored in ``authorinfor.json``.

    Args:
        row_count: Number of rows reported by the query.
        rows: Sequence of ``(authorinfor, pmc_id)`` tuples.

    Returns:
        A dict with the keys ``'pub_year'``, ``'count'``, ``'authorinfor'``
        and ``'pmc_id'``; ``'pmc_id'`` maps the row index to the row's
        pmc_id (the indices become string keys once serialized to JSON).
    """
    return {
        # NOTE(review): the query that feeds this payload has no year
        # filter, so this label looks like a leftover -- confirm before
        # relying on it. Kept unchanged to preserve the file format.
        'pub_year': "2016",
        'count': row_count,
        'authorinfor': [row[0] for row in rows],
        'pmc_id': {index: row[1] for index, row in enumerate(rows)},
    }


def fetch_author_rows():
    """Query the ``alzheimer`` table for all non-empty author-info strings.

    Returns:
        A ``(row_count, rows)`` tuple where ``row_count`` is the value
        returned by ``cursor.execute`` and ``rows`` holds the fetched
        ``(authorinfor, pmc_id)`` tuples.
    """
    # Imported here so the pure helpers above stay importable on machines
    # that do not have the MySQL driver installed.
    import pymysql

    conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python',
    )
    try:
        cursor = conn.cursor()
        try:
            sql = "SELECT authorinfor,pmc_id FROM alzheimer WHERE authorinfor != ''"
            row_count = cursor.execute(sql)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()
    return row_count, rows


def main():
    """Export every author-info string and its pmc_id to ``authorinfor.json``."""
    row_count, rows = fetch_author_rows()
    print(row_count)
    output_to_json(build_author_payload(row_count, rows), 'authorinfor.json')


if __name__ == '__main__':
    main()
2)选出排名前十的国家
import re
import collections
import json

# Characters removed from a country name: every dot and every newline.
# A raw string avoids the invalid "\." escape of a plain string literal.
_COUNTRY_NOISE = re.compile(r"[.\n]")

# Number of ranked rows written to the output file (the original loop
# condition was ``num < 50``, i.e. 49 rows).
MAX_WRITTEN_ROWS = 49

# Number of entries echoed to stdout.
# NOTE(review): the surrounding notes describe a top-ten ranking, but 11
# entries are printed -- confirm whether that is intentional.
MAX_PRINTED_ROWS = 11


def input_from_json(filename):
    """Load and return the JSON document stored in *filename*."""
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, 'r') as handle:
        return json.load(handle)


def extract_country(author_info):
    """Return the text after the last comma of *author_info*, cleaned up.

    Dots and newlines are removed and surrounding whitespace is stripped,
    so that ``" USA."`` and ``"USA"`` are counted as the same entry.
    """
    country = author_info.split(',')[-1]
    country = _COUNTRY_NOISE.sub("", country).strip()
    if not isinstance(country, str):
        # Python 2: json yields ``unicode``; keep the historical behavior
        # of storing UTF-8 byte strings. On Python 3 the text is already
        # ``str`` and must stay that way to remain JSON-serializable.
        country = country.encode('utf-8')
    return country


def count_word(path):
    """Count author-info entries per country for the JSON file at *path*.

    The file must contain an ``'authorinfor'`` list of strings; the
    country is taken to be the text after the last comma of each string.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A dict mapping each extracted country name to its occurrence count.
    """
    authorinfor_list = input_from_json(path)['authorinfor']
    counts = collections.Counter(
        extract_country(author_info) for author_info in authorinfor_list
    )
    return dict(counts)


def sort_by_count(d):
    """Return an ``OrderedDict`` of *d* sorted by descending count.

    The sort is stable, so entries with equal counts keep the relative
    order in which ``d.items()`` yields them.
    """
    return collections.OrderedDict(
        sorted(d.items(), key=lambda item: -item[1])
    )


def format_ranked_lines(sorted_counts, limit=MAX_WRITTEN_ROWS):
    """Render the first *limit* entries as one JSON object per line.

    Args:
        sorted_counts: Mapping of country to count, already ordered.
        limit: Maximum number of entries to render.

    Returns:
        A string in which every rendered entry is followed by a newline.
    """
    top_items = list(sorted_counts.items())[:limit]
    return ''.join(
        json.dumps({'name': country, 'value': value}) + '\n'
        for country, value in top_items
    )


if __name__ == '__main__':
    file_name = "authorinfor.json"
    dword = sort_by_count(count_word(file_name))
    # ``with`` guarantees the output is flushed and the handle released.
    with open('sort_country.json', 'w') as fobj2:
        fobj2.write(format_ranked_lines(dword))
    # list() makes the views sliceable on Python 3 as well.
    countrylist = list(dword.keys())
    valuelist = list(dword.values())
    print(countrylist[:MAX_PRINTED_ROWS])
    print(valuelist[:MAX_PRINTED_ROWS])