微博爬虫

代码应该不难看懂:运行前把 cookie 换成你自己登录微博后的 cookie,并在 WeiboUrl.txt 中每行写一个要抓取的链接,抓取结果会写入 Res.txt。

#!/usr/bin/env python
#coding=utf8


""" Simulate a user login to Sina Weibo with cookie.
You can use this method to visit any page that requires login.
"""

import urllib2
import sys
import re
import time

# Paste the Cookie header of a logged-in Weibo session here (copy it from
# the developer tools of Chrome or Firefox).
cookie = ''

# HTTP headers sent with each request: a desktop Firefox User-Agent plus
# the login cookie configured above.
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
headers['cookie'] = cookie

# Results file; opening it in "w" mode truncates whatever a previous run
# left behind.
C = open("Res.txt", "w")

# Captures everything between `<a target="_blank" href="` and the next
# `</a>`: the link target followed by the rest of the tag and its inner text.
_LINK_PATTERN = re.compile(r'<a target="_blank" href="(.+?)</a>')


def visit(url):
    """Fetch *url* and append every matched link fragment to the results file.

    The page is requested with the module-level ``headers`` (which carry the
    login cookie). All backslashes are removed from the response body, and
    each match of ``_LINK_PATTERN`` is written to the module-level file ``C``
    on a line of its own.

    Args:
        url: Address of the page to fetch. Surrounding whitespace (such as
            the newline left behind by ``readline()``) is ignored; an empty
            or whitespace-only value is skipped without issuing a request.

    Returns:
        None. Matches are written to ``C`` as a side effect.
    """
    url = url.strip()
    if not url:
        return

    # Pause for a second before each request (presumably to avoid hammering
    # the server -- the original code gives no reason).
    time.sleep(1)

    req = urllib2.Request(url, headers=headers)
    resp = urllib2.urlopen(req)
    try:
        text = resp.read()
    finally:
        # Python 2 response objects are not context managers, so release
        # the connection explicitly.
        resp.close()

    # Strip every backslash in one pass so that the pattern below can match
    # markup that arrives escaped (e.g. `<\/a>`).
    text = text.replace('\\', '')

    for fragment in _LINK_PATTERN.findall(text):
        C.write(fragment + '\n')

# Crawl every URL listed in WeiboUrl.txt (one URL per line), echoing each
# one to stdout as a progress indicator before it is fetched.
try:
    with open("WeiboUrl.txt") as f:
        for line in f:
            url = line.strip()
            if not url:
                # A blank line is not a URL and cannot be fetched.
                continue
            sys.stdout.write(url + '\n')
            visit(url)
finally:
    # Close the results file so buffered matches reach the disk even when
    # a request fails part-way through the list.
    C.close()
posted @ 2017-05-31 16:36  qscqesze  阅读(436)  评论(0)  编辑  收藏  举报