利用Python爬取大学官网上边的精品文章(对初学者很友好)
一、需求分析
1.包含发布日期,作者,标题,阅读数以及正文。
2.可自动翻页。
3.范围:2020年内
二、实现代码
def import_mysql(data_list):
    """Recreate the ``novel`` table and insert every record of *data_list*.

    The table is dropped and recreated on each call, so rows from earlier
    runs are lost. Each record is inserted and committed on its own; a record
    that fails is rolled back and reported, and the remaining records are
    still attempted.

    Args:
        data_list: Iterable of dicts. The keys of each dict are used as the
            column names and its values as the row values. The table created
            here has the columns ``date``, ``title``, ``author``,
            ``read_count`` and ``content``.
    """
    # Local import keeps this snippet runnable on its own.
    import pymysql

    # NOTE(review): connection settings and credentials are hard-coded;
    # consider moving them to configuration.
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         port=3306, db='spiders', charset='utf8')
    try:
        with db.cursor() as cursor:
            # Drop the table first if it already exists.
            cursor.execute("DROP TABLE IF EXISTS novel")
            sql = 'CREATE TABLE IF NOT EXISTS novel (' \
                  'date VARCHAR(255) NOT NULL, ' \
                  'title VARCHAR(255) NOT NULL, ' \
                  'author VARCHAR(255) NOT NULL, ' \
                  'read_count VARCHAR(255) NOT NULL,' \
                  'content VARCHAR(10000) NOT NULL)'
            cursor.execute(sql)

            table = 'novel'
            for data in data_list:
                keys = ', '.join(data.keys())
                values = ', '.join(['%s'] * len(data))
                sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(
                    table=table, keys=keys, values=values)
                try:
                    if cursor.execute(sql, tuple(data.values())):
                        print('Successful import mysql')
                        db.commit()
                except pymysql.MySQLError as exc:
                    # Report the reason instead of hiding it, then undo the
                    # failed statement and carry on with the next record.
                    print('Failed import mysql:', exc)
                    db.rollback()
    finally:
        # Always release the connection, even if table creation fails.
        db.close()


# NOTE(review): the ``class`` header of the heap was lost in the article's
# formatting. The methods below need one (they use ``self`` and ``@property``),
# so it is restored here; ``MinHeap`` is a placeholder name -- rename it to
# match the original.
class MinHeap:
    """Binary min-heap stored as linked nodes rather than in an array.

    Relies on a ``Node`` class defined elsewhere. From the calls made here it
    is constructed as ``Node(value, parent, insert_priority)`` and offers
    ``get_child_node(side)``, ``append_child_node(side, node)``,
    ``delete_child_node(side)``, ``get_node_value()``, ``set_node_value(v)``,
    ``get_parent_node()`` and ``get_insert_priority()``, where ``side`` is
    ``"left"`` or ``"right"``. The insert priority assigned by ``insert`` is
    the node's depth (root = 0).
    """

    def __init__(self):
        self.__head = None              # root node, or None when empty
        self.__node_be_inserted = None  # node that receives the next child
        self.__node_be_moved = None     # leaf chosen by the last deleMin

    # ---- Layer 1: traversal and value-moving primitives -------------------

    def _collect_open_nodes(self):
        """Return, in pre-order, every node missing at least one child."""
        open_nodes = []
        stack = [] if self.__head is None else [self.__head]
        while stack:
            node = stack.pop()
            left = node.get_child_node("left")
            right = node.get_child_node("right")
            if left is None or right is None:
                open_nodes.append(node)
            # Push right before left so the left subtree is visited first.
            if right is not None:
                stack.append(right)
            if left is not None:
                stack.append(left)
        return open_nodes

    def traverse_get_be_inserted_node(self):
        """Select the node that will receive the next inserted child.

        Picks the node with the smallest insert priority among the nodes that
        still have a free child slot (first one in pre-order on ties) and
        stores it in ``self.__node_be_inserted``; stores ``None`` when the
        heap is empty.
        """
        # A node with only ONE free slot must qualify as well; otherwise the
        # right slots are never used and the tree degenerates into a chain.
        self.__node_be_inserted = min(
            self._collect_open_nodes(),
            key=lambda node: node.get_insert_priority(),
            default=None,
        )

    def value_float(self, newNode):
        """Float the value of *newNode* up while it is smaller than its parent.

        Only values are swapped; the links between nodes are not changed.

        Returns:
            A status string when the root was reached, otherwise ``None``.
        """
        parent = newNode.get_parent_node()
        if parent is None:
            return "已经上浮到 root"

        node = newNode
        while node.get_node_value() < parent.get_node_value():
            node_value = node.get_node_value()
            node.set_node_value(parent.get_node_value())
            parent.set_node_value(node_value)
            node = parent
            parent = node.get_parent_node()
            if parent is None:
                return "已经上浮到root"
        return None

    def traverse_get_be_moved_node(self):
        """Select the leaf whose value replaces the root in ``deleMin``.

        Picks the node with the largest insert priority among the nodes that
        have a free child slot (first one in pre-order on ties) and stores it
        in ``self.__node_be_moved``; stores ``None`` when the heap is empty.
        """
        self.__node_be_moved = max(
            self._collect_open_nodes(),
            key=lambda node: node.get_insert_priority(),
            default=None,
        )

    def value_down(self):
        """Sink the root's value until no child holds a smaller value.

        At each step the value is swapped with the smaller child (the left
        one on ties). Only values are swapped; node links are not changed.

        Returns:
            The string ``"value down successful"``.
        """
        node = self.__head
        while node is not None:
            left = node.get_child_node("left")
            right = node.get_child_node("right")
            if left is None:
                smaller_child = right
            elif right is None:
                smaller_child = left
            elif right.get_node_value() < left.get_node_value():
                smaller_child = right
            else:
                smaller_child = left

            if smaller_child is None:
                break  # reached a leaf
            node_value = node.get_node_value()
            child_value = smaller_child.get_node_value()
            if not child_value < node_value:
                break  # heap order already holds here
            node.set_node_value(child_value)
            smaller_child.set_node_value(node_value)
            node = smaller_child
        return "value down successful"

    # ---- Layer 2: public mutators ----------------------------------------

    def insert(self, value):
        """Insert *value* into the heap.

        Raises:
            RuntimeError: If the cached insertion node has no free child slot
                (an internal invariant violation).
        """
        if self.__head is None:
            # First node becomes the root.
            newNode = Node(value, None, 0)
            self.__head = newNode
            self.__node_be_inserted = newNode
            return

        parent_node = self.__node_be_inserted
        newNode = Node(value, parent_node,
                       parent_node.get_insert_priority() + 1)
        if parent_node.get_child_node("left") is None:
            parent_node.append_child_node("left", newNode)
        elif parent_node.get_child_node("right") is None:
            parent_node.append_child_node("right", newNode)
        else:
            # Floating an unattached node would corrupt stored values.
            raise RuntimeError("得到的self.__node_be_inserted出现错误")
        self.value_float(newNode)
        self.traverse_get_be_inserted_node()

    def deleMin(self):
        """Remove and return the smallest value, or ``None`` if empty."""
        head = self.__head
        if head is None:
            return None

        minValue = head.get_node_value()
        if (head.get_child_node("left") is None
                and head.get_child_node("right") is None):
            self.__head = None
            self.__node_be_inserted = None
            self.__node_be_moved = None
            return minValue

        # Move a deepest leaf's value into the root, detach that leaf, then
        # restore heap order from the top.
        self.traverse_get_be_moved_node()
        be_moved_node = self.__node_be_moved
        head.set_node_value(be_moved_node.get_node_value())
        parentNode = be_moved_node.get_parent_node()
        if parentNode.get_child_node("left") is be_moved_node:
            side = "left"
        else:
            side = "right"
        parentNode.delete_child_node(side)
        self.value_down()
        # The detached leaf may have been the cached insertion node, so the
        # cache must be refreshed before the next insert.
        self.traverse_get_be_inserted_node()
        return minValue

    # ---- Layer 3: bulk construction --------------------------------------

    def buildHeap(self, _list):
        """Insert every element of *_list* in ascending order.

        The argument itself is left unmodified.
        """
        for item in sorted(_list):
            self.insert(item)

    # ---- Queries ----------------------------------------------------------

    def findMin(self):
        """Return the smallest value without removing it (``None`` if empty)."""
        if self.__head is None:
            return None
        return self.__head.get_node_value()

    @property
    def isEmpty(self):
        """Return the string ``"True"`` if the heap is empty, else ``"False"``.

        NOTE(review): these are strings, not booleans, so ``"False"`` is
        truthy. Compare against the string explicitly; kept as strings for
        compatibility with existing callers.
        """
        if self.__head is None:
            return "True"
        return "False"

    @property
    def size(self):
        """Return the number of nodes by walking the whole tree."""
        count = 0
        stack = [] if self.__head is None else [self.__head]
        while stack:
            node = stack.pop()
            count += 1
            for side in ("left", "right"):
                child = node.get_child_node(side)
                if child is not None:
                    stack.append(child)
        return count
三、效果展示
搞定,还望各位大佬不吝赐教~