摘要: # -*- coding: utf-8 -*- import re import urllib import os.path def getHtml(url): page = urllib.urlopen(url) html = page.read() return html def getImg(html,p): reg = r'<img src="(htt... 阅读全文
posted @ 2016-11-29 23:13 brady-wang 阅读(342) 评论(0) 推荐(0) 编辑
摘要: #coding:utf8 import urllib2 __author__ = 'wang' class HtmlDownloader(object): def download(self, url): if url is None: return None response = urllib2.urlopen(url) ... 阅读全文
posted @ 2016-11-29 22:46 brady-wang 阅读(956) 评论(0) 推荐(0) 编辑
摘要: #coding:utf8 __author__ = 'wang' class HtmlOutputer(object): def __init__(self): self.datas = []; def collect_data(self, data): if data is None: return ... 阅读全文
posted @ 2016-11-29 22:45 brady-wang 阅读(460) 评论(0) 推荐(0) 编辑
摘要: #coding:utf8 import urlparse from bs4 import BeautifulSoup import re __author__ = 'wang' class HtmlParser(object): def parse(self, page_url, html_cont): if page_url is None or html_con... 阅读全文
posted @ 2016-11-29 22:44 brady-wang 阅读(686) 评论(0) 推荐(0) 编辑
摘要: spider_main.py 阅读全文
posted @ 2016-11-29 22:42 brady-wang 阅读(700) 评论(0) 推荐(0) 编辑
摘要: #coding:utf8 class UrlManager(object): def __init__(self): self.new_urls = set() self.old_urls = set() def add_new_url(self, url): if url is None: return... 阅读全文
posted @ 2016-11-29 22:42 brady-wang 阅读(887) 评论(0) 推荐(0) 编辑
摘要: import re from bs4 import BeautifulSoup html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <p class="title"><b>The Dormouse' 阅读全文
posted @ 2016-11-29 22:20 brady-wang 阅读(422) 评论(0) 推荐(0) 编辑
摘要: 在 Python 的 Scripts 目录下执行:pip install beautifulsoup4 阅读全文
posted @ 2016-11-29 22:00 brady-wang 阅读(209) 评论(0) 推荐(0) 编辑
摘要: # -*- coding: utf-8 -*- import cookielib import urllib2 url = "http://www.baidu.com" print "第一种方法" response1 = urllib2.urlopen(url) print response1.getcode() print len(response1.read()) print "第二种方... 阅读全文
posted @ 2016-11-29 21:53 brady-wang 阅读(1662) 评论(0) 推荐(0) 编辑
摘要: sz /etc/sysconfig/network-scripts/ifcfg-eth1 阅读全文
posted @ 2016-11-29 09:16 brady-wang 阅读(388) 评论(0) 推荐(0) 编辑