利用Python爬取大学官网上边的精品文章(对初学者很友好)

一、需求分析

1.包含发布日期,作者,标题,阅读数以及正文。

2.可自动翻页。

3.范围:2020年内

二、实现代码

def import_mysql(data_list):
    """Recreate the ``novel`` table and insert every record of *data_list*.

    Connects to the local ``spiders`` MySQL database, drops any existing
    ``novel`` table, creates it again and inserts one row per record.  Every
    row is committed on its own; a row that fails is rolled back and reported
    on stdout without stopping the remaining rows.

    Args:
        data_list: Iterable of dicts.  The keys of each dict are used as the
            column names of the INSERT (the table defines ``date``, ``title``,
            ``author``, ``read_count`` and ``content``) and the values are
            bound as query parameters.
    """
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spiders', charset='utf8')
    # try/finally guarantees the connection is released even when the
    # DROP/CREATE statements below raise.
    try:
        cursor = db.cursor()

        # Start from an empty table on every import.
        cursor.execute("DROP TABLE IF EXISTS novel")

        sql = 'CREATE TABLE IF NOT EXISTS novel (' \
              'date VARCHAR(255) NOT NULL, ' \
              'title VARCHAR(255) NOT NULL, ' \
              'author VARCHAR(255) NOT NULL, ' \
              'read_count VARCHAR(255) NOT NULL,' \
              'content VARCHAR(10000) NOT NULL)'
        cursor.execute(sql)
        table = 'novel'
        for data in data_list:
            # NOTE(review): column names are interpolated into the statement
            # text; only the values are parameterized.  Keys must therefore
            # come from trusted code, never from scraped input.
            keys = ', '.join(data.keys())
            values = ', '.join(['%s'] * len(data))
            sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
            try:
                if cursor.execute(sql, tuple(data.values())):
                    print('Successful import mysql')
                    db.commit()
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit still
                # propagate instead of being reported as a failed row.
                print('Failed import mysql')
                db.rollback()
    finally:
        db.close()
def __init__(self):
    """Create an empty heap: no root and no cached insert/remove target."""
    self.__head = self.__node_be_inserted = self.__node_be_moved = None
#采用函数层次架构

#层次一:
def traverse_get_be_inserted_node(self):  # find the node that receives the next insert
    """Cache in ``self.__node_be_inserted`` the parent for the next insert.

    Walks the tree in pre-order, collects every node that still has a free
    child slot (left or right) and keeps the one with the smallest insert
    priority; the first such node met during the walk wins ties.  On an
    empty heap the cached node is reset to None.
    """
    if self.__head is None:
        self.__node_be_inserted = None
        return

    open_nodes = []
    pending = [self.__head]
    while pending:
        node = pending.pop()
        left = node.get_child_node("left")
        right = node.get_child_node("right")
        # A node can take a child when EITHER slot is free.  Both slots must
        # be inspected; testing "left" twice would skip nodes that only lack
        # a right child.
        if left is None or right is None:
            open_nodes.append(node)
        # Push right first so the left subtree is visited first (pre-order).
        if right is not None:
            pending.append(right)
        if left is not None:
            pending.append(left)

    # min() returns the first minimal element, i.e. the earliest node in
    # pre-order among those with the lowest priority.
    self.__node_be_inserted = min(open_nodes, key=lambda n: n.get_insert_priority())

def value_float(self, newNode):  # let a small value float upward
    """Move the value held by *newNode* up while it is smaller than its parent's.

    Only the stored values are exchanged between a node and its parent; the
    links between node objects are left untouched.

    Args:
        newNode: Node whose value should float towards the root.

    Returns:
        "已经上浮到 root" when *newNode* has no parent at all, "已经上浮到root"
        when the root is reached after at least one exchange (both messages
        mean "already floated up to the root"), otherwise None.
    """
    node = newNode
    swapped = False
    while True:
        value = node.get_node_value()
        parent = node.get_parent_node()
        if parent is None:
            # The two messages differ by one space depending on whether any
            # exchange happened before the root was reached.
            return "已经上浮到root" if swapped else "已经上浮到 root"
        parent_value = parent.get_node_value()
        if not value < parent_value:
            return None
        # Exchange the two values, then continue from the parent.
        node.set_node_value(parent_value)
        parent.set_node_value(value)
        node = parent
        swapped = True
def traverse_get_be_moved_node(self):
    """Cache in ``self.__node_be_moved`` the node to detach on removal.

    Walks the tree in pre-order, collects every node that lacks a left or a
    right child and keeps the one with the largest insert priority; the first
    such node met during the walk wins ties.

    Possible improvement: this shares almost all of its code with
    traverse_get_be_inserted_node() and the two could be merged.
    """
    candidates = []
    pending = [self.__head]
    while pending:
        node = pending.pop()
        left = node.get_child_node("left")
        right = node.get_child_node("right")
        if left is None or right is None:
            candidates.append(node)
        # Push right first so the left subtree is visited first (pre-order).
        if right is not None:
            pending.append(right)
        if left is not None:
            pending.append(left)

    best_node = None
    best_priority = -1  # lower bound: only priorities above -1 are accepted
    for node in candidates:
        priority = node.get_insert_priority()
        if priority > best_priority:
            best_node, best_priority = node, priority
    self.__node_be_moved = best_node

def value_down(self):
    """Sink the root's value until no child below it holds a smaller value.

    Starting at ``self.__head``, the current value is exchanged with the
    smaller of the two children (the left child is preferred unless the right
    one is strictly smaller) for as long as that child's value is smaller.
    A final node with a single child is compared with that child once.  Only
    values move between nodes; the node links are never changed.

    Returns:
        Always the string "value down successful".
    """
    node = self.__head
    while True:
        left = node.get_child_node("left")
        right = node.get_child_node("right")
        if left is None or right is None:
            break

        value = node.get_node_value()
        left_value = left.get_node_value()
        right_value = right.get_node_value()

        if right_value < left_value:
            child, child_value = right, right_value
        else:
            child, child_value = left, left_value

        if not child_value < value:
            # The current value is already the smallest of the three.
            return "value down successful"

        node.set_node_value(child_value)
        child.set_node_value(value)
        node = child

    # At most one child remains at this point.
    only_child = left if left is not None else right
    if only_child is not None:
        value = node.get_node_value()
        child_value = only_child.get_node_value()
        if value > child_value:
            only_child.set_node_value(value)
            node.set_node_value(child_value)
    return "value down successful"
#层次二
def insert(self, value):
    """Add *value* to the heap.

    The first value becomes the root.  Every later value is wrapped in a new
    node attached to the free child slot (left first) of the cached insertion
    target, floated upward with value_float(), and the insertion target is
    recomputed for the next call.

    Args:
        value: Value to store; it is compared with ``<`` against the values
            already in the heap.
    """
    if self.__head is None:  # first node becomes the root
        newNode = Node(value, None, 0)
        self.__head = newNode
        self.__node_be_inserted = newNode
        return

    parent_node = self.__node_be_inserted
    target_is_full = (
        parent_node is not None
        and parent_node.get_child_node("left") is not None
        and parent_node.get_child_node("right") is not None
    )
    if parent_node is None or target_is_full:
        # The cached target is missing or has no free slot: recompute it
        # rather than attaching nowhere and sifting a detached node.
        self.traverse_get_be_inserted_node()
        parent_node = self.__node_be_inserted

    newNode = Node(value, parent_node, parent_node.get_insert_priority() + 1)
    if parent_node.get_child_node("left") is None:
        parent_node.append_child_node("left", newNode)
    elif parent_node.get_child_node("right") is None:
        parent_node.append_child_node("right", newNode)
    else:
        # Still no free slot after recomputing.  Report it and stop here:
        # floating an unattached node would swap values out of the heap.
        print("得到的self.__node_be_inserted出现错误")
        return
    self.value_float(newNode)
    self.traverse_get_be_inserted_node()

def deleMin(self):
    """Remove the root's value from the heap and return it.

    The value of the node chosen by traverse_get_be_moved_node() is copied
    into the root, that node is detached from its parent, and value_down()
    sinks the copied value back into place.  The cached insertion target is
    then refreshed, because it may be the node that was just detached.

    Returns:
        The value that was stored at the root, or None when the heap is empty.
    """
    if self.__head is None:
        return None

    root = self.__head
    minValue = root.get_node_value()
    if root.get_child_node("left") is None and root.get_child_node("right") is None:
        # Removing the only node empties the heap; drop the cached target too
        # so nothing keeps pointing at the removed node.
        self.__head = None
        self.__node_be_inserted = None
        return minValue

    self.traverse_get_be_moved_node()
    be_moved_node = self.__node_be_moved
    root.set_node_value(be_moved_node.get_node_value())

    parentNode = be_moved_node.get_parent_node()
    # Identity, not equality: the slot to clear is the one holding this exact
    # node object.
    side = "left" if parentNode.get_child_node("left") is be_moved_node else "right"
    parentNode.delete_child_node(side)

    self.value_down()
    # The tree shape changed; recompute where the next insert must attach.
    self.traverse_get_be_inserted_node()
    return minValue


#层次三
def buildHeap(self, _list):
    """Insert every value of *_list* into the heap, smallest first.

    Args:
        _list: Iterable of mutually comparable values.  It is read only; the
            caller's object is left in its original order.
    """
    # sorted() builds a new list, so the argument is not reordered in place
    # and any iterable (tuple, set, generator) is accepted.
    for value in sorted(_list):
        self.insert(value)

#函数层次架构外

def findMin(self):
    """Return the value stored at the root without removing it.

    Returns:
        The root's value, or None when the heap is empty (the same convention
        deleMin() uses for an empty heap).
    """
    if self.__head is None:
        return None
    return self.__head.get_node_value()

@property
def isEmpty(self):
    """Report whether the heap has no root node.

    Returns:
        The string "True" when ``self.__head`` is None, otherwise the string
        "False".
    """
    # NOTE(review): both results are non-empty strings, so `if heap.isEmpty:`
    # is always true; callers have to compare against "True" explicitly.
    return "True" if self.__head is None else "False"

@property
def size(self):
    """Count the nodes reachable from the root, i.e. the number of keys stored.

    Returns:
        The node count; 0 for an empty heap.
    """
    count = 0
    pending = [self.__head]
    while pending:
        node = pending.pop()
        if node is None:
            continue
        count += 1
        # Queue both slots; empty (None) slots are skipped when popped.
        pending.append(node.get_child_node("right"))
        pending.append(node.get_child_node("left"))
    return count

三、效果展示

搞定,还望各位大佬不吝赐教~

posted @ 2020-11-30 12:12  不善言谈者2018  阅读(226)  评论(0)  编辑  收藏  举报