X-man

导航

爬取博客数据

 

 

 

#coding:utf-8

import os
import time
import urllib

url = ['']*350
page = 1
link = 1
while page <= 7:
    con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_'+str(page)+'.html').read()
    i = 0
    title = con.find(r'<a title=')
    href = con.find(r'href=',title)
    html = con.find(r'.html',href)

    while title != -1 and href != -1 and html != -1 and i < 50:
        url[i] = con[href + 6 : html + 5]
        print link, url[i]
        content = urllib.urlopen(url[i]).read()
        open(r'hanhan/'+url[i][-26:],'w+').write(content)
        print 'downloading', url[i]
        time.sleep(1)
        title = con.find(r'<a title=', html)
        href = con.find(r'href=', title)
        html = con.find(r'.html', href)
        i = i + 1
        link = link + 1
    else:
        print page,'find end!'
    page = page + 1
else:
    print 'all find end'
    print 'all find end'

 

posted on 2016-04-24 10:49  雨钝风轻  阅读(350)  评论(0)  编辑  收藏  举报