Web Scraping Practice: 汽车之家 (Autohome)

# coding: utf-8
"""
Created on Mon Dec 24 19:24:35 2018

@author: gz91061
"""

# Python 2 only -- force UTF-8 as the default encoding (unnecessary on Python 3):
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')

# import the SQLAlchemy ORM framework
from sqlalchemy import Column, String, create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

from sqlalchemy import VARCHAR, Integer
import re
import requests
import time

'''Create the database table'''
Base = declarative_base()   # declarative base class that the ORM models inherit from
engine = create_engine('postgresql+psycopg2://postgres:martine0703@127.0.0.1/luowenxing')   # initialize the database connection
session = sessionmaker(bind=engine)  # session factory bound to the engine
sess = session()
metadata = Base.metadata


class pcauto_baike(Base):
    __tablename__ = 'pcauto_baike'  # table name
    keyID = Column(Integer(), primary_key=True)  # auto-incrementing primary key
    url = Column(String())          # page URL
    pageSource = Column(VARCHAR())  # raw HTML of the page
    crawl_time = Column(VARCHAR())  # timestamp of the crawl
    flag = Column(Integer())        # page-type marker (see flag() below)
metadata.create_all(engine)
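# Note: create_all() is idempotent -- it only creates tables that do not yet
# exist, so re-running the script keeps previously stored rows intact.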

# return the current time as a formatted string (stored in the crawl_time column)
def time_capture():
    return time.strftime('%Y.%m.%d %H:%M:%S', time.localtime(time.time()))



# request headers that mimic a desktop Chrome browser
def headers():
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'Content-Type': 'text/html; charset=UTF-8'
    }
    return header

# fetch a URL and return its HTML source
def fetch(url):
    header = headers()
    data = requests.get(url, headers=header)
    data.encoding = 'utf-8'  # adjust to match the target page's actual encoding
    data = data.text
    print('%s fetched successfully' % url)
    time.sleep(7)  # throttle requests to be polite to the server
    return data
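# Note: requests.get() above is called without a timeout, so a stalled
# connection can hang the crawler indefinitely. A more defensive variant
# (a sketch; the 30-second limit is an assumed value) would be:
#     data = requests.get(url, headers=header, timeout=30)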


# return True if the URL has not been stored yet; returns None (falsy) otherwise
def check(url):
    temp = sess.query(pcauto_baike.pageSource).filter(pcauto_baike.url == url).first()
    if not temp:
        return True


def save(obj):
    sess.add(obj)
    sess.commit()
    print('saved successfully')


# return all stored rows whose flag matches the given value
def read_pageSource(flag):
    rows = sess.query(pcauto_baike).filter(pcauto_baike.flag == flag).all()
    return rows


# classify a URL into a page-type flag: 3 if a 'p' occurs in the last five
# characters (pagination pages), 1 if the URL contains 'cl', 2 if it contains
# 'cs' (series list pages), 4 for everything else
def flag(url):
    if 'p' in url[-5:]:
        return 3
    elif 'cl' in url:
        return 1
    elif 'cs' in url:
        return 2
    else:
        return 4
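# Examples with hypothetical URLs:
#     flag('https://baike.pcauto.com.cn/cl456/')    -> 1
#     flag('https://baike.pcauto.com.cn/cs123/')    -> 2
#     flag('https://baike.pcauto.com.cn/cs123/p2/') -> 3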


# determine the total number of list pages: a '尾页' ("last page") link carries
# the page count in its href; '下一页' ("next page") without '尾页' means exactly
# two pages; otherwise there is only one page
def pagenumber(pagesource):
    if '尾页' in pagesource:
        pat1 = '<a href="//baike.pcauto.com.cn/cs.*?p(.*?)/" class="next">尾页</a>'
        num = re.findall(pat1, pagesource)[0]
        return int(num)
    elif '下一页' in pagesource:
        return 2
    else:
        return 1
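# Example with hypothetical markup: for a page containing
#     <a href="//baike.pcauto.com.cn/cs123/p15/" class="next">尾页</a>
# pagenumber() extracts and returns 15.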


# parse the saved home page (flag 0) and crawl every category link found on it
def parse_0():
    pagesource0 = read_pageSource(0)[0].pageSource
    pat_urllist = r'<a href="(//baike.pcauto.com.cn/\S{,5})" target="_blank">\S.*?\n'
    urllist = re.compile(pat_urllist).findall(pagesource0)
    for i in urllist:
        url = 'https:' + i
        check_data = check(url)
        if check_data:
            pagesource = fetch(url)
            time_c = time_capture()
            flag_url = flag(url)
            obj = pcauto_baike(url=url, pageSource=pagesource, crawl_time=time_c, flag=flag_url)
            save(obj)
        else:
            print('%s already exists' % url)


# walk every stored 'cs' page (flag 2): crawl all of its pagination pages,
# then crawl every article detail link found on each pagination page
def parse_2():
    pagesource2 = read_pageSource(2)
    for page_s in pagesource2:
        url_2 = page_s.url[:-1]  # strip the trailing '/' so 'p<n>/' can be appended
        totalpage = pagenumber(page_s.pageSource)
        for i in range(totalpage):
            url_3 = url_2 + 'p' + str(i + 1) + '/'
            check_data = check(url_3)
            if check_data:
                page_3 = fetch(url_3)
                time_c = time_capture()
                flag_url = flag(url_3)
                obj = pcauto_baike(url=url_3, pageSource=page_3, crawl_time=time_c, flag=flag_url)
                save(obj)
            else:
                print('%s already exists' % url_3)
            # re-read the page source from the database (also covers the already-stored case)
            page_3 = sess.query(pcauto_baike.pageSource).filter(pcauto_baike.url == url_3).first()[0]
            pat_url = r'<a href="(//baike.pcauto.com.cn/.*?html)" target="_blank">.*?</a>'
            urllist = re.compile(pat_url).findall(page_3)
            for u in urllist:
                u_sub = 'https:%s' % u
                check_data = check(u_sub)
                if check_data:
                    page_4 = fetch(u_sub)
                    time_c = time_capture()
                    flag_url = flag(u_sub)
                    obj = pcauto_baike(url=u_sub, pageSource=page_4, crawl_time=time_c, flag=flag_url)
                    save(obj)
                else:
                    print('%s already exists' % u_sub)


# entry point: crawl the baike home page (flag 0), then descend two levels
def start():
    url = 'https://baike.pcauto.com.cn/'
    check_data = check(url)
    if check_data:
        page_0 = fetch(url)
        time_c = time_capture()
        obj = pcauto_baike(url=url, pageSource=page_0, crawl_time=time_c, flag=0)
        save(obj)
    else:
        print('%s already exists' % url)
    parse_0()
    parse_2()


if __name__ == '__main__':
    start()
    sess.close()
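
After a run it is easy to sanity-check what landed in the table. Below is a minimal sketch, assuming it runs in the same module (so the session factory and the pcauto_baike model defined above are in scope), that counts the stored pages per flag value:

from sqlalchemy import func

check_sess = session()  # reuse the session factory defined above
for flag_value, count in check_sess.query(pcauto_baike.flag, func.count()).group_by(pcauto_baike.flag).all():
    print('flag=%s: %s pages stored' % (flag_value, count))
check_sess.close()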

I feel I have truly fallen in love with 博客园 (cnblogs). I will make good use of this platform and record every little bit of my learning!

posted @ 2019-01-07 18:17  冷锋战士