12 What I learned and how I felt
Today I finished crawling the relevant data into the database: the hot-word categories, the hot words themselves, their explanations, and the titles and links of the articles related to each hot word. I stored the crawled data in three tables, which will make it easy to pull the data out later when I write the backend.
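For reference, this is roughly the table layout I am working with. I have not pasted the real CREATE TABLE statements here, so the following is only a sketch: the hotwords and title_href columns are inferred from the INSERT/UPDATE statements in the crawler code below, and the newstype columns are a guess, since the crawler itself never writes to that table.

import pymysql

conn = pymysql.connect(host="localhost", user="root", port=3306,
                       password="123456", database="hotwords")
ddl = [
    # news categories; the column names here are only a guess
    "CREATE TABLE IF NOT EXISTS newstype ("
    " typeid INT PRIMARY KEY AUTO_INCREMENT,"
    " typename VARCHAR(64))",
    # hot words, their news category, and the Baidu Baike explanation
    "CREATE TABLE IF NOT EXISTS hotwords ("
    " wordsid INT PRIMARY KEY AUTO_INCREMENT,"
    " name VARCHAR(64),"
    " newstype VARCHAR(64),"
    " wordexplain TEXT)",
    # article titles and links whose title mentions a hot word
    "CREATE TABLE IF NOT EXISTS title_href ("
    " id INT PRIMARY KEY AUTO_INCREMENT,"
    " title VARCHAR(255),"
    " href VARCHAR(255),"
    " wordname VARCHAR(64),"
    " newstype VARCHAR(64))",
]
with conn.cursor() as cursor:
    for stmt in ddl:
        cursor.execute(stmt)
conn.commit()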
import requests
from lxml import etree
import time
import pymysql
import datetime
import urllib
import json
import jieba
import pandas as pd
import re
import os
from collections import Counter

conn = pymysql.connect(
    host="localhost",
    user="root",
    port=3306,
    password="123456",
    database="hotwords")


# insert a hot word and its news category into the database
def dbinsertwords(k, indexname):
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO hotwords (name,newstype) VALUES (%s,%s);",
        [k, indexname])
    conn.commit()
    cursor.close()


# read all hot word names
def dbgetWordsName():
    cursor = conn.cursor()
    cursor.execute("select name from hotwords;")
    datas = cursor.fetchall()
    data = []
    for row in datas:
        data.append(row[0])
    cursor.close()
    return data


# read all hot word ids
def dbgetWordsId():
    cursor = conn.cursor()
    cursor.execute("select * from hotwords;")
    datas = cursor.fetchall()
    data = []
    for row in datas:
        data.append(row[0])
    cursor.close()
    return data


# write the explanation of a hot word back to its row
def dbinsertExplain(id, word, explain):
    print(id)
    print(word)
    print(explain)
    cursor = conn.cursor()
    # parameterized update instead of string concatenation, so quotes in the
    # explanation cannot break the SQL
    cursor.execute(
        "update hotwords set wordexplain=%s where wordsid=%s",
        [explain, id])
    cursor.close()
    conn.commit()


# read all hot word names together with their news categories
def dbgethotwords():
    cursor = conn.cursor()
    cursor.execute("select * from hotwords;")
    datas = cursor.fetchall()
    data = []
    data2 = []
    for row in datas:
        data.append(row[1])
        data2.append(row[2])
    cursor.close()
    return data, data2


# insert an article title/link that mentions a hot word
def dbinsertHref(title, url, name, type):
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO title_href (title,href,wordname,newstype) VALUES (%s,%s,%s,%s);",
        [title, url, name, type])
    conn.commit()
    cursor.close()


# walk over the trailing numbers of the different news category urls
def loopNewsType():
    for i in range(1101, 1111):
        if i == 1104 or i == 1105 or i == 1106 or i == 1107 or i == 1108 or i == 1109:
            i = i + 5
        elif i == 1110:
            i = 1199
        # walk over the pages of this category
        getDiffPage(i)


def getDiffPage(i):
    if i == 1199:
        # this category has 86 pages of data
        for page in range(0, 5):
            # fetch the entries of each page
            type = '其他'
            getEachPage(page, i, type)
    else:
        # these categories have 100 pages of data
        for page in range(0, 5):
            # fetch the entries of each page
            type = '一般'
            getEachPage(page, i, type)


def getEachPage(page, i, type):
    url = "https://news.cnblogs.com/n/c" + str(i) + "?page=" + str(page)
    r = requests.get(url)
    html = r.content.decode("utf-8")
    html1 = etree.HTML(html)
    href = html1.xpath('//h2[@class="news_entry"]/a/@href')
    title = html1.xpath('//h2[@class="news_entry"]/a/text()')
    indexname1 = html1.xpath('//div[@id = "guide"]/h3/text()')
    # indexname is the news category the hot words belong to
    indexname = indexname1[0].replace(' ', '').replace('/', '')
    print(indexname)
    file = open("middle/" + indexname + ".txt", "a+", encoding='utf-8')
    print(len(href))
    for a in range(0, 18):
        print(href[a], title[a])  # title and link
        # getDetail(href[a], title[a])
        file.write(title[a] + '\n')
    file.close()  # flush the titles before re-reading the file below
    print("page: " + str(page))
    if type == '一般' and page == 4:
        print("inside getEachPage")
        file = open("middle/" + indexname + ".txt", "r", encoding='utf-8')
        getKeyWords(file, indexname)
    if type == '其他' and page == 4:
        file = open("middle/" + indexname + ".txt", "r", encoding='utf-8')
        getKeyWords(file, indexname)


# analyse word frequency to get the hot words
def getKeyWords(filehandle, indexname):
    print("getKeyWords")
    mystr = filehandle.read()
    # print(mystr)
    seg_list = jieba.cut(mystr)  # precise mode by default
    print(seg_list)
    stopwords = {}.fromkeys([line.rstrip() for line in open(r'stopwords.txt', encoding='utf-8')])
    c = Counter()
    for x in seg_list:
        if x not in stopwords:
            if len(x) > 1 and x != '\r\n':
                c[x] += 1
    print('\nword frequency statistics:')
    for (k, v) in c.most_common(10):  # keep the 10 most frequent words
        print("%s:%d" % (k, v))
        dbinsertwords(k, indexname)
    # print(mystr)
    filehandle.close()


# fill in the explanation of every hot word in the database
def wordsExplain():
    # for root, dirs, files in os.walk("final"):
    #     print(files)  # all non-directory files under the current path
    #     print(len(files))
    #     for i in range(0, len(files)):
    #         filename = files[i].replace(".txt", "")  # hot word name
    #         print(files[i])
    #         getExpalin(files[i])
    #         # break
    words = dbgetWordsName()
    ids = dbgetWordsId()
    print(len(words))
    print(len(ids))
    for i in range(0, len(words)):
        # print(words[i])
        explain = climingExplain(words[i])
        # print(explain)
        if ids[i] is None:
            pass
        else:
            dbinsertExplain(ids[i], words[i], explain)


def getExpalin(filename):
    lines = []
    for line in open("final/" + filename, encoding='utf-8'):
        explain = climingExplain(line)
        line = line + "\t" + explain
        print(explain)
        print("line:" + line.replace("\n", ""))
        lines.append(line.replace("\n", ""))
        # f = open("final/"+filename, 'w+', encoding='utf-8')
        # f.write(line + "\n")
    f = open("final/" + filename, 'w+', encoding='utf-8')
    for i in range(0, len(lines)):
        f.write(lines[i] + "\n")
    f.close()


# crawl the explanation of one word from Baidu Baike
def climingExplain(line):
    line1 = line.replace('\n', '')
    # print(line1)
    url = "https://baike.baidu.com/item/" + str(line1)
    # print(url)
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        'cookie': 'BAIDUID=AB4524A16BFAFC491C2D9D7D4CAE56D0:FG=1; BIDUPSID=AB4524A16BFAFC491C2D9D7D4CAE56D0; PSTM=1563684388; MCITY=-253%3A; BDUSS=jZnQkVhbnBIZkNuZXdYd21jMG9VcjdoanlRfmFaTjJ-T1lKVTVYREkxVWp2V2RlSVFBQUFBJCQAAAAAAAAAAAEAAACTSbM~Z3JlYXTL3tGpwOTS9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACMwQF4jMEBed; pcrightad9384=showed; H_PS_PSSID=1454_21120; delPer=0; PSINO=3; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; __yjsv5_shitong=1.0_7_a3331e3bd00d7cbd253c9e353f581eb2494f_300_1581332649909_58.243.250.219_d03e4deb; yjs_js_security_passport=069e28a2b81f7392e2f39969d08f61c07150cc18_1581332656_js; Hm_lvt_55b574651fcae74b0a9f1cf9c8d7c93a=1580800784,1581160267,1581268654,1581333414; BK_SEARCHLOG=%7B%22key%22%3A%5B%22%E7%96%AB%E6%83%85%22%2C%22%E6%95%B0%E6%8D%AE%22%2C%22%E9%9D%9E%E6%AD%A3%E5%BC%8F%E6%B2%9F%E9%80%9A%22%2C%22mapper%22%5D%7D; Hm_lpvt_55b574651fcae74b0a9f1cf9c8d7c93a=1581334123'
    }
    r = requests.get(url, headers=head)
    # print(r.status_code)
    html = r.content.decode("utf-8")
    # print(html)
    html1 = etree.HTML(html)
    # print(html1)
    content1 = html1.xpath('//div[@class="lemma-summary"]')
    # print(content1[0])
    if len(content1) == 0:
        # fall back to the list-style summary: custom_dot para-list list-paddingleft-1
        content1 = html1.xpath('string(//ul[@class="custom_dot para-list list-paddingleft-1"])')
        print(content1)
        if len(content1) == 0:
            print('未找到解释')
            content1 = '未找到解释'  # "explanation not found"
        return content1
    else:
        content2 = content1[0].xpath('string(.)').replace(' ', '').replace('\n', '')
        return content2


# match news titles against the hot words and store the links
def words_href():
    # print(dbgethotwords())
    data = dbgethotwords()
    name = data[0]
    type = data[1]
    print(name)
    # walk over the news list, compare every title with the hot words,
    # and insert matches together with their news category
    for i in range(0, 50):
        print(i)
        page = i + 1
        url = "https://news.cnblogs.com/n/page/" + str(page)
        r = requests.get(url)
        html = r.content.decode("utf-8")
        html1 = etree.HTML(html)
        href = html1.xpath('//h2[@class="news_entry"]/a/@href')
        title = html1.xpath('//h2[@class="news_entry"]/a/text()')
        for a in range(0, 30):
            getHref(href[a], title[a], name, type)
        # print(len(href))


def getHref(href, title, name, type):
    print(title)
    url = "https://news.cnblogs.com" + href
    for i in range(0, len(name)):
        m = title.find(name[i])
        if m != -1:
            dbinsertHref(title, url, name[i], type[i])
        else:
            pass


if __name__ == '__main__':
    # the three stages are run one after another; uncomment the one that is needed
    # stage 1: walk the news categories and extract the hot words
    # loopNewsType()
    # stage 2: crawl the hot word explanations
    # wordsExplain()
    # stage 3: collect the article links that reference the hot words
    words_href()
This code pulls together many of the fragmented methods I had written earlier. The data it produces lands in the following three tables:
newstype table:
hotwords table:
title_href table:
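Splitting the data this way means the backend mostly only needs straightforward lookups. Here is a minimal sketch of the kind of queries I expect to write there; get_words_for_type and get_articles_for_word are hypothetical helper names, and the columns are the ones the crawler above inserts.

import pymysql

conn = pymysql.connect(host="localhost", user="root", port=3306,
                       password="123456", database="hotwords")

# hot words of one news category, together with their explanations
def get_words_for_type(newstype):
    with conn.cursor() as cursor:
        cursor.execute(
            "SELECT name, wordexplain FROM hotwords WHERE newstype = %s",
            [newstype])
        return cursor.fetchall()

# articles whose titles mention a given hot word
def get_articles_for_word(wordname):
    with conn.cursor() as cursor:
        cursor.execute(
            "SELECT title, href FROM title_href WHERE wordname = %s",
            [wordname])
        return cursor.fetchall()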
I did hit a problem when the UI requests the specific hot words under a given news category: the total number of hot words returned is huge, and I haven't solved that yet. For now, this is all there is to show.
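One direction I suspect will help: the middle/*.txt files are opened in append mode and nothing deduplicates the hotwords table, so repeated runs pile up duplicate rows, and no query caps how many words a category returns. Deduplicating and capping on the query side might already make the list usable. A rough, untested sketch; top_words_for_type is a hypothetical helper that reuses the conn connection from the sketch above.

# untested sketch: collapse duplicate rows and keep only the most
# frequently inserted hot words for one category
def top_words_for_type(newstype, limit=10):
    with conn.cursor() as cursor:
        cursor.execute(
            "SELECT name, COUNT(*) AS cnt FROM hotwords"
            " WHERE newstype = %s GROUP BY name"
            " ORDER BY cnt DESC LIMIT %s",
            [newstype, limit])
        return cursor.fetchall()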
I browsed a lot of other people's blogs, and the gap is real. What I want to say is: I may not be strong now, but that doesn't mean I won't be later; after all, I'm going to be in this line of work for a long time. Accumulate a little at a time, grow a little at a time.
My attitude wasn't great before and I fell behind quite a bit, but that's fine; I believe I can slowly make it up and come out stronger. The more I can learn, the better. That's it for today. Good night, The World.