Learning Web Crawlers: 汽车之家 (Autohome)
The full script is below. It starts from https://baike.pcauto.com.cn/, stores each fetched page's raw HTML in a PostgreSQL table via SQLAlchemy, deduplicates by URL, and then walks the category list pages, their paginated pages, and finally the individual article pages.

# coding: utf-8
"""
Created on Mon Dec 24 19:24:35 2018

@author: gz91061
"""
# Python 2 leftovers, not needed on Python 3:
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')

# Import the SQLAlchemy framework
from sqlalchemy import Column, String, create_engine
from sqlalchemy import VARCHAR, Integer
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
import re
import requests
import time

# Create the database table
Base = declarative_base()  # base class for ORM models
engine = create_engine('postgresql+psycopg2://postgres:martine0703@127.0.0.1/luowenxing')  # initialize the database connection
Session = sessionmaker(bind=engine)  # create the session factory
sess = Session()
metadata = Base.metadata


class pcauto_baike(Base):
    __tablename__ = 'pcauto_baike'  # table name
    keyID = Column(Integer, primary_key=True)  # auto-incrementing primary key
    url = Column(String())
    pageSource = Column(VARCHAR())
    crawl_time = Column(VARCHAR())
    flag = Column(Integer)


metadata.create_all(engine)


# Capture the current time as the crawl timestamp
def time_capture():
    return time.strftime('%Y.%m.%d %H:%M:%S ', time.localtime(time.time()))


# Request headers that mimic a desktop Chrome browser
def headers():
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'Content-Type': 'text/html; charset=UTF-8'
    }
    return header


# Fetch a URL and return its HTML text
def fetch(url):
    header = headers()
    data = requests.get(url, headers=header)
    data.encoding = 'utf-8'  # adjust to match the page's actual encoding
    data = data.text
    print('%s fetched successfully' % url)
    time.sleep(7)  # throttle requests to go easy on the server
    return data


# True if the URL has not been stored yet
def check(url):
    temp = sess.query(pcauto_baike.pageSource).filter(pcauto_baike.url == url).first()
    return temp is None


# Persist one record and commit
def save(obj):
    sess.add(obj)
    sess.commit()
    print('Saved successfully')


# Load all stored rows with the given flag
def read_pageSource(flag):
    rows = sess.query(pcauto_baike).filter(pcauto_baike.flag == flag).all()
    return rows


# Classify a URL: 3 = paginated list, 1 = 'cl' list, 2 = 'cs' list, 4 = detail page
def flag(url):
    if 'p' in url[-5:]:
        return 3
    elif 'cl' in url:
        return 1
    elif 'cs' in url:
        return 2
    else:
        return 4


# Read the total page count from a list page's pager
# ('尾页' means "last page" and '下一页' means "next page" in the site's HTML)
def pagenumber(pagesource):
    if '尾页' in pagesource:
        pat1 = r'<a href="//baike.pcauto.com.cn/cs.*?p(.*?)/" class="next">尾页</a>'
        num = re.findall(pat1, pagesource)[0]
        return int(num)
    elif '下一页' in pagesource:
        return 2
    else:
        return 1


# Parse the home page (flag 0) and fetch every category link on it
def parse_0():
    pagesource0 = read_pageSource(0)[0].pageSource
    pat_urllist = r'<a href="(//baike.pcauto.com.cn/\S{,5})" target="_blank">\S.*?\n'
    urllist = re.compile(pat_urllist).findall(pagesource0)
    for i in urllist:
        url = 'https:' + i
        if check(url):
            pagesource = fetch(url)
            time_c = time_capture()
            flag_url = flag(url)
            obj = pcauto_baike(url=url, pageSource=pagesource, crawl_time=time_c, flag=flag_url)
            save(obj)
        else:
            print('%s already exists' % url)


# Walk every 'cs' list page (flag 2): fetch each paginated page,
# then fetch every article it links to
def parse_2():
    pagesource2 = read_pageSource(2)
    for page_s in pagesource2:
        url_2 = page_s.url[:-1]
        totalpage = pagenumber(page_s.pageSource)
        for i in range(totalpage):
            url_3 = url_2 + 'p' + str(i + 1) + '/'
            if check(url_3):
                page_3 = fetch(url_3)
                time_c = time_capture()
                flag_url = flag(url_3)
                obj = pcauto_baike(url=url_3, pageSource=page_3, crawl_time=time_c, flag=flag_url)
                save(obj)
            else:
                print('%s already exists' % url_3)
                # already stored: reload its HTML so its article links can still be followed
                page_3 = sess.query(pcauto_baike.pageSource).filter(pcauto_baike.url == url_3).first()[0]
            pat_url = r'<a href="(//baike.pcauto.com.cn/.*?html)" target="_blank">.*?</a>'
            urllist = re.compile(pat_url).findall(page_3)
            for u in urllist:
                u_sub = 'https:%s' % u
                if check(u_sub):
                    page_4 = fetch(u_sub)
                    time_c = time_capture()
                    flag_url = flag(u_sub)
                    obj = pcauto_baike(url=u_sub, pageSource=page_4, crawl_time=time_c, flag=flag_url)
                    save(obj)
                else:
                    print('%s already exists' % u_sub)


# Entry point: fetch the home page, then drill down level by level
def start():
    url = 'https://baike.pcauto.com.cn/'
    if check(url):
        page_0 = fetch(url)
        time_c = time_capture()
        obj = pcauto_baike(url=url, pageSource=page_0, crawl_time=time_c, flag=0)
        save(obj)
    else:
        print('%s already exists' % url)
    parse_0()
    parse_2()


if __name__ == '__main__':
    start()
    sess.close()
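To see how pagenumber() pulls the total page count out of a list page's pager, here is a minimal, self-contained check. The pager HTML below is a hand-written stand-in modeled on the pattern the function expects, not captured from the live site:

import re

# Hypothetical pager snippet shaped like what pagenumber() matches
sample = '<a href="//baike.pcauto.com.cn/cs123p15/" class="next">尾页</a>'
pat1 = r'<a href="//baike.pcauto.com.cn/cs.*?p(.*?)/" class="next">尾页</a>'
print(re.findall(pat1, sample))  # ['15'] -> pagenumber() would return 15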
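The flag() helper classifies URLs purely by substring checks, and the branch order matters: the 'p' test on the last five characters runs first, so a paginated 'cs' page gets 3, not 2. The URLs below are hypothetical examples to exercise each branch, assuming the script above is in scope:

# Hypothetical URLs, one per branch of flag()
for u in ['https://baike.pcauto.com.cn/cs123p2/',   # 'p' in the last 5 chars -> 3
          'https://baike.pcauto.com.cn/cl45/',      # contains 'cl' -> 1
          'https://baike.pcauto.com.cn/cs45/',      # contains 'cs' -> 2
          'https://baike.pcauto.com.cn/12345.html']:  # anything else -> 4
    print(u, flag(u))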
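Once start() has finished, a quick sanity check is to count the stored pages per flag value. A minimal sketch, assuming the pcauto_baike table above has been populated and sess is still open:

from sqlalchemy import func

# Count stored pages per flag (0 = home, 1 = 'cl', 2 = 'cs', 3 = paginated, 4 = article)
for flag_value, n in (sess.query(pcauto_baike.flag, func.count(pcauto_baike.keyID))
                          .group_by(pcauto_baike.flag)
                          .all()):
    print('flag=%s: %s pages' % (flag_value, n))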
I feel I have truly fallen in love with 博客园 (cnblogs). I will make good use of this platform and record every little bit of my learning!