#补交作业
# Make-up homework, part 1: word-frequency count over the lyrics of
# "Counting Stars", followed by small list / tuple / dict / set demos.
from collections import Counter

# The lyrics are kept as one long string; adjacent literals are concatenated
# by the parser, so every chunk ends with the separating space.
cc = (
    "Counting stars "
    "Lately I've been, I've been losing sleep "
    "Dreaming 'bout the things that we could be "
    "But baby I've been, I've been prayin' hard "
    "Said no more counting dollars "
    "We'll be counting stars "
    "Yeah, we'll be counting stars "
    "I see this life "
    "Like a swinging vine "
    "Swing my heart across the line "
    "In my face is flashing signs "
    "Seek it out and ye shall find "
    "Old, but I'm not that old "
    "Young, but I'm not that bold "
    "And I don't think the world is sold "
    "I'm just doing what we're told "
    "I, feel something so right "
    "But doing the wrong thing "
    "I, feel something so wrong "
    "But doing the right thing "
    "I could lie, could lie, could lie "
    "everything that kills me makes me feel alive "
    "Lately I've been, I've been losing sleep "
    "Dreaming 'bout the things that we could be "
    "Baby I've been, I've been prayin' hard "
    "Said no more counting dollars "
    "We'll be counting stars "
    "Lately I've been, I've been losing sleep "
    "Dreaming 'bout the things that we could be "
    "Baby I've been, I've been prayin' hard "
    "Said no more counting dollars "
    "We'll be, we'll be counting stars "
    "I feel the love "
    "And I feel it burn "
    "Down this river every turn "
    "Hope is a four letter word "
    "Make that money "
    "Watch it burn "
    "Old, but I'm not that old "
    "Young, but I'm not that bold "
    "And I don't think the world is sold "
    "I'm just doing what we're told "
    "I, feel something so wrong "
    "But doing the right thing "
    "I could lie, could lie, could lie "
    "Everything that drowns me makes me wanna fly "
    "Lately I've been, I've been losing sleep "
    "Dreaming 'bout the things that we could be "
    "Baby I've been, I've been prayin' hard "
    "Said no more counting dollars "
    "We'll be counting stars "
    "Lately I've been, I've been losing sleep "
    "Dreaming 'bout the things that we could be "
    "Baby I've been, I've been prayin' hard "
    "Said no more counting dollars "
    "We'll be, we'll be counting stars "
    "Take that money "
    "And watch it burn "
    "Sink in the river "
)
cc = cc.replace('.', ' ')
ccList = cc.split()
# Split into words and report the number of words (not characters).
print(len(ccList), ccList)

# The set of distinct words drives the per-word report below.
ccSet = set(ccList)
print(ccSet)

# Count whole-word occurrences in the token list. Counting substrings in the
# raw text instead would inflate short words such as "a" or "I".
strDict = dict(Counter(ccList))
for key in ccSet:
    print(key, strDict[key])

# (word, count) pairs in unsorted form; a set has no .items(), so the pairs
# have to come from the dictionary.
wclist = list(strDict.items())
print(wclist)

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
print(wcList)
wcList.sort(key=lambda x: x[1], reverse=True)
print(wcList)

# Print the top 20; slicing stays safe if there are fewer than 20 words.
for entry in wcList[:20]:
    print(entry)

# Traversing a list.
cclist = ['wqdq', 'dqd', 'Awd', 313, '小四', 'dqd']
print(cclist)
cclist.append('gegeheh')
print(cclist)
cclist.pop(2)
print(cclist)
for i in cclist:
    print(i)

# Traversing a tuple (named so that it does not shadow the builtin `tuple`).
sample_tuple = ('jtfjhrr', 'rqfw f2q', 800, 10)
print(sample_tuple[2])
for i in sample_tuple:
    print(i)

# Traversing a dictionary.
dic = {'fhehe': '4w6436', 'jgdns': 7, '4w6436': 'First'}
print('fhehe:', dic['fhehe'])
print('4w6436:', dic['4w6436'])
# Two successive updates of the same key; only the second value survives.
dic['4w6436'] = 8
dic['4w6436'] = "对接欧文机房的维护"
print('4w6436:', dic['4w6436'])
print('4w6436:', dic['4w6436'])
for key in dic:
    print(key, ':', dic.get(key))

# Traversing a set.
a = {1, 2, 3, 6, 5}
print(a)
a.add(4)
print(a)
a.add('uteru')
print(a)
a.remove(5)
print(a)
for i in a:
    print(i)
#此次作业
# Current homework: word-frequency statistics for an English text file and
# for a Chinese text file (segmented with jieba).
from collections import Counter

# --- English version ---
# Read the whole file and lower-case it so "The" and "the" count together.
# The context manager closes the file even if reading fails.
with open('ccc1015.txt', 'r', encoding='utf-8') as fo:
    strBig = fo.read().lower()
print(strBig)

# Pre-processing: delete punctuation so "word," and "word" are one token.
sep = """.,:;!?"""
strBig = strBig.translate(str.maketrans('', '', sep))
strlist = strBig.split()
print(len(strlist), strlist)

# Stop words must be lower case because the text was lower-cased above
# (an upper-case 'I' could never match).
exclude = {'is', 'be', 'i', 'we', 'the', 'in'}
strSet = set(strlist) - exclude
print(len(strSet), strSet)

# One counting pass over the tokens, then drop the stop words.
strDict = {
    word: count
    for word, count in Counter(strlist).items()
    if word not in exclude
}
print(len(strDict), strDict)

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
print(wcList)
wcList.sort(key=lambda x: x[1], reverse=True)
print(wcList)

# Print the top 20; slicing stays safe if there are fewer than 20 words.
for entry in wcList[:20]:
    print(entry)

# --- Chinese version ---
# Read the text file.
with open('shengxu.txt', 'r', encoding='utf-8') as f:
    story = f.read()
print(story)

# Pre-processing: replace each punctuation mark with a space.
sep = ',。:“”?!'
story = story.translate(str.maketrans(sep, ' ' * len(sep)))
print(story)

# Chinese word segmentation with jieba (default mode of jieba.cut).
import jieba

cnStr = story
tokens = list(jieba.cut(cnStr))  # segment once and reuse the result
print(tokens)

# Extract the words. Whitespace-only tokens (the spaces that replaced the
# punctuation, and line breaks) are not words and would otherwise dominate
# the ranking, so they are dropped here.
strList = [token for token in tokens if token.strip()]
print(len(strList), strList)

# Distinct words and per-word counts.
strSet = set(strList)
print(len(strSet), strSet)
strDict = dict(Counter(strList))

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
wcList.sort(key=lambda x: x[1], reverse=True)

# Print the top 10; slicing stays safe if there are fewer than 10 words.
for entry in wcList[:10]:
    print(entry)