爬虫知识
-
def progress(blk, blk_size, total_size):
    """Report download progress; intended as a ``reporthook`` for urlretrieve.

    Prints the number of bytes transferred so far (``aa``), the total size
    (``bb``) and the completed percentage (``cc``).

    Args:
        blk: Number of blocks transferred so far.
        blk_size: Size of one block, in bytes.
        total_size: Total size of the download in bytes. A value of zero or
            less is treated as "size unknown" and reported as 0 percent.
    """
    downloaded = blk * blk_size
    if total_size > 0:
        # Multiply by 100.0 first so the division is a float division on
        # Python 2 as well, and clamp because the final block usually
        # overshoots the real size.
        percent = min(100.0, 100.0 * downloaded / total_size)
    else:
        # Avoid ZeroDivisionError / negative percentages when the total
        # size is not known.
        percent = 0.0
    print("aa=%d\nbb=%d\ncc=%f%%" % (downloaded, total_size, percent))
filename,headers=urllib.urlretrieve("http://www.baidu.com","111.html",reporthook=progress) 自动创建文件并保存爬取的内容,显示下载进度 -
import urlparse
url="https://www.baidu.com/s?wd=bj&rsv_spt=1&rsv_iqid=0xc00e5ff600010317"
hh=urlparse.urlparse(url)
print hh
jj=urlparse.parse_qs(hh.query)
print jj 解析url参数 -
# Register a custom opener so requests go through our own handlers
# (cookie handling plus an HTTPS handler created with debuglevel=1).
import cookielib
import urllib2
url = 'https://www.baidu.com/'
request = urllib2.Request(url)
cook = cookielib.CookieJar()
cookhander = urllib2.HTTPCookieProcessor(cookiejar=cook)
opener = urllib2.build_opener(cookhander, urllib2.HTTPSHandler(debuglevel=1))
urllib2.install_opener(opener)
g = opener.open(request)
print(g.read(100))
# Get the cookies. NOTE(review): `_cookies` is a private attribute of
# CookieJar; iterating over `cook` is the public way to list cookies.
print(cook._cookies)