简单小爬虫

最近在学爬虫,网上有太多拿糗事百科做例子的教程了……但很多都千篇一律,我实际拿下来运行了一下,发现早已不能用了。

下面是我自己改写的,就简单抓下文章:

 

#coding:utf-8
import requests
import re
import time
# import thread

# Legacy module-level failure counter. It is kept only so that any outside
# reference to ``times`` still resolves; ``retry`` now counts attempts per
# call, so failures of one call no longer leak into the next.
times = 0


def retry(method, max_retries=5, delay=0.5):
    """Wrap *method* so it is called again whenever it raises.

    Usable directly as a decorator (``@retry``) with the default settings.

    Args:
        method: The callable to protect.
        max_retries: How many additional attempts are made after the first
            failed call. The wrapped callable is therefore invoked at most
            ``max_retries + 1`` times per call of the wrapper.
        delay: Seconds to sleep between two consecutive attempts.

    Returns:
        A wrapper that forwards its positional arguments to *method* and
        returns the result of the first attempt that does not raise, or
        ``False`` when every attempt raised.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(method)
    def fun(*args):
        for attempt in range(max_retries + 1):
            try:
                # Return the successful result straight away; the previous
                # recursive implementation dropped it after a retry.
                return method(*args)
            except Exception:
                # Deliberate best effort: any error counts as a failed
                # attempt. Only pause when another attempt will follow.
                if attempt < max_retries:
                    time.sleep(delay)
        # All attempts failed; callers treat ``False`` as "give up".
        return False

    return fun

class Spider_Model:
    """Scraper for qiushibaike.com: collects article ids, then article text.

    ``GetArticleFromSource`` scans one "hot" listing page for article ids and
    accumulates them in ``apage``; ``GetTargetPageContent`` downloads a single
    article page and extracts its ``<div class="content">`` blocks.
    """

    # User-Agent header sent with every request.
    USER_AGENT = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the crawl forever.
    REQUEST_TIMEOUT = 10

    # Captures the numeric id of every link of the form /article/<id>.
    _ARTICLE_LINK_RE = re.compile(r'<a href="/article/(\d*)".*>')
    # Captures the inner HTML of each content <div>; DOTALL lets it span lines.
    _CONTENT_RE = re.compile(r'<div class="content">(.*?)</div>', re.S)

    def __init__(self):
        self.page = 1
        self.pages = []
        # Article ids matched on the most recently scanned listing page.
        self.article = []
        # Unique article ids gathered so far, in order of first appearance.
        self.apage = []

    # NOTE: this method is intentionally left undecorated (the original had
    # ``@retry`` commented out), so network errors propagate to the caller.
    def GetArticleFromSource(self, page):
        """Scan listing page *page* and record the article ids found on it.

        Args:
            page: Page number of the "hot" listing; a string or anything
                convertible with ``str()``.

        The ids matched on this page are stored in ``self.article``; ids not
        seen before are appended to ``self.apage``.
        """
        url = "https://www.qiushibaike.com/hot/page/" + str(page) + "/"
        headers = {'User-Agent': self.USER_AGENT}
        response = requests.get(url=url, headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
        self.article = self._ARTICLE_LINK_RE.findall(response.text)
        for article_id in self.article:
            # ``\d*`` can match an empty id; such a link names no article.
            if article_id and article_id not in self.apage:
                self.apage.append(article_id)

    @retry
    def GetTargetPageContent(self, articlePage):
        """Download one article page and return its content blocks.

        Args:
            articlePage: Article id, appended to the article base URL.

        Returns:
            A non-empty list with the inner HTML of every
            ``<div class="content">`` element on the page.

        Raises:
            ValueError: If the page contains no content block. The method is
                wrapped by ``retry``, which catches the exception, so callers
                receive the wrapper's result rather than the exception.
        """
        url = "https://www.qiushibaike.com/article/" + str(articlePage)
        headers = {'User-Agent': self.USER_AGENT}
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params`` (the query string).
        response = requests.get(url, headers=headers,
                                timeout=self.REQUEST_TIMEOUT)

        content = self._CONTENT_RE.findall(response.text)
        if not content:
            # Raise instead of ``assert False`` so the check survives ``-O``.
            raise ValueError("no content block found for article %s"
                             % articlePage)
        return content


if __name__ == "__main__":
    # Script entry point: collect article ids from hot-list pages 1..13, then
    # fetch each article and print its first content block.
    mModel = Spider_Model()
    for page in range(1, 14):
        mModel.GetArticleFromSource(str(page))

    article_page_list = mModel.apage
    for article_id in article_page_list:
        content = mModel.GetTargetPageContent(str(article_id))
        # A falsy result (False, None or an empty list) means the fetch
        # yielded nothing printable, so skip this article.
        if not content:
            continue
        # Single-argument call form prints identically on Python 2 and 3.
        print(content[0])
posted @ 2017-08-29 09:22  北城以北花似海  阅读(135)  评论(0)  编辑  收藏  举报