
This is a simple crawler for fetching medical physicists' conference reports (AAPM reports); the database table structure is still being refined.

The crawler code:

# -*- coding:utf-8 -*-
import urllib.request
import pymysql
from bs4 import BeautifulSoup
import time
import re
import os

# Base class for database access
class Conn_Mssql:
    # Query the MySQL database
    @staticmethod
    def Select_mssql(strsql):
        # Database connection settings
        conn = pymysql.connect(host="DESKTOP-V9MQNL6", user="root", password="password",
                               database="internetdaq", charset="utf8")
        cur = conn.cursor()
        cur.execute(strsql)
        return cur

    # Insert into or update the database
    @staticmethod
    def InsertOrUpdate_mssql(strsql):
        # Database connection settings
        conn = pymysql.connect(host="DESKTOP-V9MQNL6", user="root", password="password",
                               database="internetdaq", charset="utf8")
        cur = conn.cursor()
        cur.execute(strsql)
        conn.commit()
        conn.close()
        return cur

# Fetch information from the web and store it
class Get_HttpMessage:
    # Download a file
    @staticmethod
    def getFile(url):
        try:
            file_name = url.split('/')[-1]
            # Make sure the download directory exists
            os.makedirs("StorePDF", exist_ok=True)
            file_path = os.path.join("StorePDF", file_name)
            u = urllib.request.urlopen(url)
        except:
            print(url, "url file not found")
            return
        block_sz = 90192
        with open(file_path, 'wb') as f:
            while True:
                buffer = u.read(block_sz)
                if buffer:
                    f.write(buffer)
                else:
                    break
        print("Successfully downloaded " + file_name)

    # Start crawling
    @staticmethod
    def startGet():
        print('start')
        # The AAPM reports page to crawl
        url = "https://www.aapm.org/pubs/reports/"
        request = urllib.request.Request(url)
        response = urllib.request.urlopen(request)
        data = response.read()
        soup = BeautifulSoup(data, "lxml")
        # Links whose href attribute contains "docid"
        for link in soup.find_all(href=re.compile("docid")):
            # Link address
            text_url = link['href']
            # Link text
            text_Name = link.get_text()
            if len(text_url) > 0 and len(text_Name) > 10:
                strSQl = "insert into daqtest (SAVE_TIME,URL_Name,URL_Link) values (NOW(),'" + text_Name + "','" + url + text_url + "')"
                strSQl = strSQl.encode('utf8')
                try:
                    # Store the link information
                    Conn_Mssql.InsertOrUpdate_mssql(strSQl)
                except:
                    print('failed to store parent page link in MySQL')
                time.sleep(1)
                # The page that contains the report PDFs
                urlSecond = url + text_url
                request2 = urllib.request.Request(urlSecond)
                response2 = urllib.request.urlopen(request2)
                data2 = response2.read()
                soup2 = BeautifulSoup(data2, "lxml")
                # Used to skip duplicate PDF links
                pdfName = ""
                # Find the PDF links in the page
                for link2 in soup2.find_all(href=re.compile("pdf")):
                    # PDF address
                    text_url2 = link2['href']
                    # Source page of the PDF
                    text_Name2 = url + text_url
                    if len(text_url2) > 0 and pdfName != text_url2:
                        pdfName = text_url2
                        strSQl2 = "insert into daqtest (SAVE_TIME,URL_Name,URL_Link) values (NOW(),'" + text_Name2 + "','" + text_url2 + "')"
                        strSQl2 = strSQl2.encode('utf8')
                        try:
                            # Store the PDF information in the database
                            Conn_Mssql.InsertOrUpdate_mssql(strSQl2)
                            # Slow down a little to reduce load on the site
                            time.sleep(1)
                            # Download the PDF file
                            Get_HttpMessage.getFile(text_url2)
                        except:
                            print('failed to store child page link in MySQL')

# Program entry point
Get_HttpMessage.startGet()
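One caveat with the inserts above: they splice scraped page text straight into the SQL string, so a report title containing a quote character will break the statement (and this is also why the bare except around the insert fires). A minimal sketch of the same insert using pymysql's parameter binding instead; the helper name insert_link is mine, the connection settings and table come from this post:

# Hypothetical helper: same insert as above, but the driver escapes the values,
# so quotes in the scraped text cannot break the statement.
import pymysql

def insert_link(name, link):
    conn = pymysql.connect(host="DESKTOP-V9MQNL6", user="root", password="password",
                           database="internetdaq", charset="utf8")
    try:
        with conn.cursor() as cur:
            # %s placeholders are filled in by pymysql, not by string concatenation
            cur.execute(
                "insert into daqtest (SAVE_TIME, URL_Name, URL_Link) values (NOW(), %s, %s)",
                (name, link))
        conn.commit()
    finally:
        conn.close()

The crawler could then call insert_link(text_Name, url + text_url) instead of building strSQl by hand, and the encode('utf8') step becomes unnecessary.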

This is the database table structure used for storage:

/*
Navicat MySQL Data Transfer

Source Server         : dde
Source Server Version : 50624
Source Host           : DESKTOP-V9MQNL6:3306
Source Database       : internetdaq

Target Server Type    : MYSQL
Target Server Version : 50624
File Encoding         : 65001
*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for daqtest
-- ----------------------------
DROP TABLE IF EXISTS `daqtest`;
CREATE TABLE `daqtest` (
  `ID` bigint(20) NOT NULL AUTO_INCREMENT,
  `SAVE_TIME` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
  `URL_Name` varchar(600) COLLATE utf8_unicode_ci DEFAULT NULL,
  `URL_Link` varchar(6000) COLLATE utf8_unicode_ci DEFAULT NULL,
  PRIMARY KEY (`ID`)
) ENGINE=InnoDB AUTO_INCREMENT=4634 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
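To spot-check what the crawler has written into this table, a short query sketch (reusing the connection settings from the crawler above):

# Sketch: list the ten most recently stored links from daqtest.
import pymysql

conn = pymysql.connect(host="DESKTOP-V9MQNL6", user="root", password="password",
                       database="internetdaq", charset="utf8")
with conn.cursor() as cur:
    cur.execute("select SAVE_TIME, URL_Name, URL_Link from daqtest "
                "order by SAVE_TIME desc limit 10")
    for save_time, name, link in cur.fetchall():
        print(save_time, name, link)
conn.close()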

 

Posted on 2018-01-21 17:22 by Qt超