Hadoop综合大作业

1.用Hive对爬虫大作业产生的文本文件（或者英文词频统计下载的英文长篇小说）词频统计。

# Load the raw text that will be cleaned below. The context manager closes
# the handle even if the read raises, which the manual open/close did not.
# NOTE(review): the file is read with the platform default encoding while the
# cleaned output is written as UTF-8 - confirm how note.txt was saved.
with open('note.txt', 'r') as f:
    song = f.read()

def writeFilenote(contnet):
    """Append ``contnet`` to ``newnote.txt`` as UTF-8 text.

    The file is opened in append mode, so repeated calls (or repeated runs
    of the script) add to the existing contents instead of replacing them.

    Args:
        contnet: The text to append.
    """
    # The context manager closes the file even if the write raises.
    with open('newnote.txt', 'a', encoding='utf-8') as f:
        f.write(contnet)

# Punctuation marks (ASCII and full-width) that are blanked out of the text.
symbol = ''',.？！’;?!:"“”-%$'''

# Stop-word list. It is defined here but not applied anywhere in this snippet.
exclude = '''
a an the in on to at and of is was are were i he she you your they us their our it or for be too do no 
that s so as but it's
'''

# Replace every punctuation character with a single space in one pass,
# then persist the cleaned text and echo it to the console.
song = song.translate(str.maketrans(dict.fromkeys(symbol, ' ')))
writeFilenote(song)
print(song)

先用 Python 将文本中的标点符号替换为空格，然后将结果写入 newnote.txt。

然后在 Hive 中对 newnote.txt 进行词频统计，结果如下图。（具体过程与上次作业基本相同，这里不再贴出。）

2.用Hive对爬虫大作业产生的csv文件进行数据分析，写一篇博客描述你的分析过程和分析结果。

def getNewsDetail(newsUrl, timeout=10):
    """Fetch one news article page and extract its source, title and time.

    Args:
        newsUrl: URL of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection and freeze the whole crawl.

    Returns:
        A dict with the keys ``'source'``, ``'title'`` and ``'time'``, each
        holding the text of the first matching element on the page.

    Raises:
        IndexError: If the page lacks any of the expected elements.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    resd = requests.get(newsUrl, timeout=timeout)
    # Force UTF-8 decoding rather than relying on the guessed encoding.
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    # The article body (selector '.artical-main-content') is not collected.
    return {
        'source': soupd.select('.comeFrom')[0].select('a')[0].text,
        'title': soupd.select('.headline')[0].text,
        'time': soupd.select('#pubtime_baidu')[0].text,
    }

def Get_page(url, timeout=10):
    """Collect the details of every article linked from one listing page.

    Args:
        url: URL of the tag listing page.
        timeout: Seconds to wait for the listing-page request before giving
            up. Without a timeout, ``requests.get`` can block indefinitely.
            It applies to the listing request only.

    Returns:
        A list of the dicts returned by ``getNewsDetail``, one per article,
        in the order the articles appear on the page.

    Raises:
        IndexError: If the page has no ``.tag-list-box`` element or an entry
            lacks the expected link structure.
        requests.exceptions.Timeout: If the listing page does not respond
            within ``timeout`` seconds.
    """
    res = requests.get(url, timeout=timeout)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    pagelist = []
    for new in soup.select('.tag-list-box')[0].select('.list'):
        link = new.select('.list-content')[0].select('.name')[0].select('.n1')[0].select('a')[0]
        # Use a separate name so the listing URL parameter is not overwritten.
        news_url = link['href']
        pagelist.append(getNewsDetail(news_url))

    return pagelist




# Crawl listing pages 1-24 of the Hupu NBA tag and save every article's
# details to result3.csv. The page count is hard-coded.
url = 'https://voice.hupu.com/nba/tag/3023-1.html'
total = []

# Get_page fetches the listing page itself, so no separate request for
# page 1 is needed here.
pagelist = Get_page(url)
total.extend(pagelist)

for i in range(2, 25):
    total.extend(Get_page('https://voice.hupu.com/nba/tag/3023-{}.html'.format(i)))

# Build the frame and write the file once, after all pages are collected,
# instead of rewriting the whole CSV on every loop iteration.
pan = pandas.DataFrame(total)
pan.to_csv('result3.csv')

---恢复内容结束---

1.用Hive对爬虫大作业产生的文本文件（或者英文词频统计下载的英文长篇小说）词频统计。

# Load the raw text that will be cleaned below. The context manager closes
# the handle even if the read raises, which the manual open/close did not.
# NOTE(review): the file is read with the platform default encoding while the
# cleaned output is written as UTF-8 - confirm how note.txt was saved.
with open('note.txt', 'r') as f:
    song = f.read()

def writeFilenote(contnet):
    """Append ``contnet`` to ``newnote.txt`` as UTF-8 text.

    The file is opened in append mode, so repeated calls (or repeated runs
    of the script) add to the existing contents instead of replacing them.

    Args:
        contnet: The text to append.
    """
    # The context manager closes the file even if the write raises.
    with open('newnote.txt', 'a', encoding='utf-8') as f:
        f.write(contnet)

# Punctuation marks (ASCII and full-width) that are blanked out of the text.
symbol = ''',.？！’;?!:"“”-%$'''

# Stop-word list. It is defined here but not applied anywhere in this snippet.
exclude = '''
a an the in on to at and of is was are were i he she you your they us their our it or for be too do no 
that s so as but it's
'''

# Replace every punctuation character with a single space in one pass,
# then persist the cleaned text and echo it to the console.
song = song.translate(str.maketrans(dict.fromkeys(symbol, ' ')))
writeFilenote(song)
print(song)

先用 Python 将文本中的标点符号替换为空格，然后将结果写入 newnote.txt。

然后在 Hive 中对 newnote.txt 进行词频统计，结果如下图。（具体过程与上次作业基本相同，这里不再贴出。）

2.用Hive对爬虫大作业产生的csv文件进行数据分析，写一篇博客描述你的分析过程和分析结果。

def getNewsDetail(newsUrl, timeout=10):
    """Fetch one news article page and extract its source, title and time.

    Args:
        newsUrl: URL of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection and freeze the whole crawl.

    Returns:
        A dict with the keys ``'source'``, ``'title'`` and ``'time'``, each
        holding the text of the first matching element on the page.

    Raises:
        IndexError: If the page lacks any of the expected elements.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    resd = requests.get(newsUrl, timeout=timeout)
    # Force UTF-8 decoding rather than relying on the guessed encoding.
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    # The article body (selector '.artical-main-content') is not collected.
    return {
        'source': soupd.select('.comeFrom')[0].select('a')[0].text,
        'title': soupd.select('.headline')[0].text,
        'time': soupd.select('#pubtime_baidu')[0].text,
    }

def Get_page(url, timeout=10):
    """Collect the details of every article linked from one listing page.

    Args:
        url: URL of the tag listing page.
        timeout: Seconds to wait for the listing-page request before giving
            up. Without a timeout, ``requests.get`` can block indefinitely.
            It applies to the listing request only.

    Returns:
        A list of the dicts returned by ``getNewsDetail``, one per article,
        in the order the articles appear on the page.

    Raises:
        IndexError: If the page has no ``.tag-list-box`` element or an entry
            lacks the expected link structure.
        requests.exceptions.Timeout: If the listing page does not respond
            within ``timeout`` seconds.
    """
    res = requests.get(url, timeout=timeout)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    pagelist = []
    for new in soup.select('.tag-list-box')[0].select('.list'):
        link = new.select('.list-content')[0].select('.name')[0].select('.n1')[0].select('a')[0]
        # Use a separate name so the listing URL parameter is not overwritten.
        news_url = link['href']
        pagelist.append(getNewsDetail(news_url))

    return pagelist




# Crawl listing pages 1-24 of the Hupu NBA tag and save every article's
# details to result3.csv. The page count is hard-coded.
url = 'https://voice.hupu.com/nba/tag/3023-1.html'
total = []

# Get_page fetches the listing page itself, so no separate request for
# page 1 is needed here.
pagelist = Get_page(url)
total.extend(pagelist)

for i in range(2, 25):
    total.extend(Get_page('https://voice.hupu.com/nba/tag/3023-{}.html'.format(i)))

# Build the frame and write the file once, after all pages are collected,
# instead of rewriting the whole CSV on every loop iteration.
pan = pandas.DataFrame(total)
pan.to_csv('result3.csv')