微博爬虫
代码比较简单，应该不难看懂：把 cookie 换成你自己登录微博后的 cookie 就可以使用了。
#!/usr/bin/env python
#coding=utf8
""" Simulate a user login to Sina Weibo with cookie.
You can use this method to visit any page that requires login.
"""
import urllib2
import sys
import re
import time
# Paste the cookie string of a logged-in browser session here
# (it can be copied from Chrome or Firefox).
cookie = ''

# Headers attached to each request: a desktop Firefox user agent plus the
# login cookie defined above.
headers = dict([
    ('User-Agent',
     'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'),
    ('cookie', cookie),
])

# Output file for the extracted results; opening in write mode truncates any
# previous content.
C = open("Res.txt", "w")
# Matches anchors that open in a new tab. The single group captures everything
# from just after the opening quote of href up to the first following "</a>",
# i.e. the target address together with the rest of the tag and its link text.
_LINK_PATTERN = re.compile(r'<a target="_blank" href="(.+?)</a>')


def visit(url):
    """Fetch one page and append every matched link fragment to the result file.

    The page is requested with the module-level ``headers`` (user agent and
    login cookie). Every backslash is removed from the response before
    matching, and each captured fragment is written to the module-level file
    ``C`` on its own line.

    Args:
        url: Address of the page to fetch. Surrounding whitespace (such as
            the newline left by ``readline``) is stripped; a blank value is
            skipped without issuing a request.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` when the
            request cannot be completed.
    """
    url = url.strip()
    if not url:
        return

    # Pause one second before each request (presumably to avoid hammering
    # the server -- confirm the intended rate limit).
    time.sleep(1)

    req = urllib2.Request(url, headers=headers)
    resp = urllib2.urlopen(req)
    try:
        text = resp.read()
    finally:
        # Release the connection even if reading fails part-way.
        resp.close()

    # Drop all backslashes so the anchor pattern can match; presumably the
    # markup arrives escaped inside script payloads -- TODO confirm.
    unescaped = text.replace('\\', '')

    C.writelines(
        fragment + '\n' for fragment in _LINK_PATTERN.findall(unescaped)
    )
# Visit every address listed in WeiboUrl.txt (one address per line).
# The ``with`` block closes the input file once all lines are processed.
with open("WeiboUrl.txt") as f:
    for line in f:
        target = line.strip()
        if not target:
            # A blank line is not an address; skip it instead of requesting it.
            continue
        # The parenthesised single-argument form behaves the same as a
        # Python 2 print statement and as the Python 3 print function.
        print(target)
        visit(target)