Internet
0x01 URL Parsing / Unparsing / Joining
Parsing
urlparse() -- split a URL into its components
# -*- coding: UTF-8 -*-
from urlparse import urlparse

url = 'http://user:pwd@NetLoc:80/p1;param/p2?query=arg#frag'
parsed = urlparse(url)
print parsed
print parsed.scheme
print parsed.netloc
print parsed.path
print parsed.params
print parsed.query
print parsed.fragment
print parsed.username
print parsed.password
print parsed.hostname, '(netloc in lowercase)'
print parsed.port
urlsplit() -- an alternative to urlparse() that does not split out the path parameters (the result has no params attribute)
# -*- coding: UTF-8 -*-
from urlparse import urlsplit

url = 'http://user:pwd@NetLoc:80/p1;param/p2?query=arg#frag'
parsed = urlsplit(url)
print parsed
print parsed.scheme
print parsed.netloc
print parsed.path
print parsed.query
print parsed.fragment
print parsed.username
print parsed.password
print parsed.hostname, '(netloc in lowercase)'
print parsed.port
urldefrag() -- strip the fragment identifier from a URL
# -*- coding: UTF-8 -*-
from urlparse import urldefrag

url = 'http://NetLoc/path;param?query=arg#frag'
print 'original :', url
url, fragment = urldefrag(url)
print 'url      :', url
print 'fragment :', fragment
Unparsing
geturl() -- only available on objects returned by urlparse() or urlsplit()
# -*- coding: UTF-8 -*-
from urlparse import urlparse

url = 'http://NetLoc/path;param?query=arg#frag'
print 'original       :', url
parsed = urlparse(url)
print 'after geturl() :', parsed.geturl()
urlunparse() -- assemble a plain tuple of strings back into a URL (if the input URL contained superfluous parts, the reconstructed URL may omit them, as the second example below shows)
# -*- coding: UTF-8 -*-
from urlparse import urlparse, urlunparse

url = 'http://NetLoc/path;param?query=arg#frag'
print 'ORIGINAL URL:', url
parsed = urlparse(url)
print 'PARSED      :', type(parsed), parsed
t = parsed[:]
print 'TUPLE       :', type(t), t
print 'NEW         :', urlunparse(t)
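To see the superfluous-parts behavior, here is a small sketch (the example URL is my own choice) in which the delimiters for the empty params, query, and fragment components are dropped on reconstruction:

# -*- coding: UTF-8 -*-
from urlparse import urlparse, urlunparse

# The trailing ';', '?' and '#' delimit empty params, query and
# fragment components; they carry no information.
original = 'http://NetLoc/path;?#'
parsed = urlparse(original)
print 'ORIGINAL:', original
# The reconstructed URL omits the empty delimiters.
print 'NEW     :', urlunparse(parsed)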
Joining
urljoin() -- construct an absolute URL from relative fragments
# -*- coding: UTF-8 -*-
from urlparse import urljoin

print urljoin('http://www.example.com/path/file.html',
              'anotherfile.html')
print urljoin('http://www.example.com/path/file.html',
              '../anotherfile.html')
# -*- coding: UTF-8 -*-
from urlparse import urljoin

print urljoin('http://www.example.com/path/', '/subpath/file.html')
print urljoin('http://www.example.com/path/', 'subpath/file.html')
Note: if the path being joined to the URL starts with a slash (/), the URL's path is reset to the top level. If it does not start with a slash, it is appended to the end of the current URL's path.
0x02 BaseHTTPServer -- base classes for implementing web servers
HTTP GET
The following example shows how a request handler returns a response to the client.
# -*- coding: UTF-8 -*-

from BaseHTTPServer import BaseHTTPRequestHandler
import urlparse

class GetHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        parsed_path = urlparse.urlparse(self.path)
        message_parts = [
            'CLIENT VALUES:',
            'client_address=%s (%s)' % (self.client_address,
                                        self.address_string()),
            'command=%s' % self.command,
            'path=%s' % self.path,
            'real_path=%s' % parsed_path.path,
            'query=%s' % parsed_path.query,
            '',
            'SERVER VALUES:',
            'server_version=%s' % self.server_version,
            'sys_version=%s' % self.sys_version,
            'protocol_version=%s' % self.protocol_version,
            '',
            'HEADERS RECEIVED:',
        ]
        for name, value in sorted(self.headers.items()):
            message_parts.append('%s=%s' % (name, value.rstrip()))
        message_parts.append('')
        message = '\r\n'.join(message_parts)
        self.send_response(200)
        self.end_headers()
        self.wfile.write(message)
        return

if __name__ == '__main__':
    from BaseHTTPServer import HTTPServer
    server = HTTPServer(('localhost', 8080), GetHandler)
    print 'Starting server, use <Ctrl-C> to stop'
    server.serve_forever()
HTTP POST
Supporting POST requests takes a bit more work, because the base class does not parse form data automatically. The cgi module provides the FieldStorage class, which knows how to parse a form, given the right inputs.
# -*- coding: UTF-8 -*-

from BaseHTTPServer import BaseHTTPRequestHandler
import cgi

class PostHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        # Parse the form data posted
        form = cgi.FieldStorage(
            fp=self.rfile,
            headers=self.headers,
            environ={'REQUEST_METHOD': 'POST',
                     'CONTENT_TYPE': self.headers['Content-Type'],
                     })

        # Begin the response
        self.send_response(200)
        self.end_headers()
        self.wfile.write('Client: %s\n' % str(self.client_address))
        self.wfile.write('User-agent: %s\n' % str(self.headers['user-agent']))
        self.wfile.write('Path: %s\n' % self.path)
        self.wfile.write('Form data:\n')

        # Echo back information about what was posted in the form
        for field in form.keys():
            field_item = form[field]
            if field_item.filename:
                # The field contains an uploaded file
                file_data = field_item.file.read()
                file_len = len(file_data)
                del file_data
                self.wfile.write(
                    '\tUploaded %s as "%s" (%d bytes)\n' %
                    (field, field_item.filename, file_len))
            else:
                # Regular form value
                self.wfile.write('\t%s=%s\n' % (field, form[field].value))

        return

if __name__ == '__main__':
    from BaseHTTPServer import HTTPServer
    server = HTTPServer(('localhost', 8080), PostHandler)
    print 'Starting server, use <Ctrl-C> to stop'
    server.serve_forever()
Threading and Forking
HTTPServer is a subclass of SocketServer.TCPServer and does not use multiple threads or processes to handle requests. To add threading or forking, create a new class using the appropriate mix-in from SocketServer (a forking sketch follows the threading example below).
# -*- coding: UTF-8 -*-

from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
from SocketServer import ThreadingMixIn
import threading

class Handler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.end_headers()
        message = threading.currentThread().getName()
        self.wfile.write(message)
        self.wfile.write('\n')
        return

class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
    """Handle requests in a separate thread."""

if __name__ == '__main__':
    server = ThreadedHTTPServer(('localhost', 8080), Handler)
    print 'Starting server, use <Ctrl-C> to stop'
    server.serve_forever()
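The process-based variant is analogous. A minimal sketch using SocketServer.ForkingMixIn (the class name ForkingHTTPServer is my own; forking requires os.fork(), so this does not run on Windows):

# -*- coding: UTF-8 -*-

from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
from SocketServer import ForkingMixIn
import os

class Handler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.end_headers()
        # Each request is handled in its own child process.
        self.wfile.write('Handled in process %d\n' % os.getpid())
        return

class ForkingHTTPServer(ForkingMixIn, HTTPServer):
    """Handle requests in a separate process."""

if __name__ == '__main__':
    server = ForkingHTTPServer(('localhost', 8080), Handler)
    print 'Starting server, use <Ctrl-C> to stop'
    server.serve_forever()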
Handling Errors
# -*- coding: UTF-8 -*-

from BaseHTTPServer import BaseHTTPRequestHandler

class ErrorHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_error(404)
        return

if __name__ == '__main__':
    from BaseHTTPServer import HTTPServer
    server = HTTPServer(('localhost', 8080), ErrorHandler)
    print 'Starting server, use <Ctrl-C> to stop'
    server.serve_forever()
Setting Headers
The send_header() method adds header data to the HTTP response.
# -*- coding: UTF-8 -*-

from BaseHTTPServer import BaseHTTPRequestHandler
import time

class GetHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header('Last-Modified',
                         self.date_time_string(time.time()))
        self.end_headers()
        self.wfile.write('Response body\n')
        return

if __name__ == '__main__':
    from BaseHTTPServer import HTTPServer
    server = HTTPServer(('localhost', 8080), GetHandler)
    print 'Starting server, use <Ctrl-C> to stop'
    server.serve_forever()
0x03 urllib -- network resource access
Purpose: access remote resources that do not require authentication, handle cookies, and so on.
Simple Retrieval with Cache
The urlretrieve() function provided by urllib downloads data. Its arguments are: 1. the URL; 2. a temporary filename for storing the data, and a function for reporting download progress. In addition, if the URL refers to a form that requires data to be submitted, urlretrieve() takes another argument containing the data to pass (a sketch of this case follows the example below). The calling program can delete the file directly, or treat it as a cache and remove it with urlcleanup().
An example of fetching data from a web server with an HTTP GET request:
# -*- coding: UTF-8 -*-

import urllib
import os

def reporthook(blocks_read, block_size, total_size):
    """total_size is reported in bytes,
    block_size is the amount read each time,
    blocks_read is the number of blocks successfully read.
    """
    if not blocks_read:
        print 'Connection opened'
        return
    if total_size < 0:
        # Unknown size
        print 'Read %d blocks (%d bytes)' % (blocks_read,
                                             blocks_read * block_size)
    else:
        amount_read = blocks_read * block_size
        print 'Read %d blocks, or %d/%d' % (blocks_read,
                                            amount_read, total_size)
    return

try:
    filename, msg = urllib.urlretrieve('http://blog.doughellmann.com/',
                                       reporthook=reporthook)
    print
    print 'File:', filename
    print 'Headers:'
    print msg
    print 'File exists before cleanup:', os.path.exists(filename)
finally:
    urllib.urlcleanup()
    print 'File still exists:', os.path.exists(filename)
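As a sketch of the form-submission case mentioned above, passing a data argument makes urlretrieve() send it in the request body as a POST. This assumes the PostHandler server from section 0x02 is listening on localhost:8080:

# -*- coding: UTF-8 -*-

import urllib

# Assumes the PostHandler example server is running on localhost:8080.
data = urllib.urlencode({'q': 'query string', 'foo': 'bar'})
filename, msg = urllib.urlretrieve('http://localhost:8080/', data=data)
with open(filename) as f:
    print f.read()
urllib.urlcleanup()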
Encoding Arguments
Arguments can be encoded and appended to the URL to pass them to the server. (Note: the urlopen() call at the end needs a server listening on localhost:8080, such as the GET handler from section 0x02; otherwise it raises an error.)
# -*- coding: UTF-8 -*-

import urllib

query_args = {'q': 'query string', 'foo': 'bar'}
encoded_args = urllib.urlencode(query_args)
print 'Encoded:', encoded_args

url = 'http://localhost:8080/?' + encoded_args
print urllib.urlopen(url).read()
To pass a sequence of values using separate occurrences of the variable in the query string, set doseq to True when calling urlencode().
# -*- coding: UTF-8 -*-

import urllib

query_args = {'foo': ['foo1', 'foo2']}
print 'Single  :', urllib.urlencode(query_args)
print 'Sequence:', urllib.urlencode(query_args, doseq=True)
The result is a query string with several values associated with the same name.
Query arguments may contain special characters that would cause problems when the URL is parsed on the server side, so urlencode() "quotes" them when they are passed in. To quote special characters locally and produce "safe" versions of a string, use the quote() or quote_plus() functions directly.
# -*- coding: UTF-8 -*-

import urllib

url = 'http://localhost:8080/~dhellmann/'
print 'urlencode() :', urllib.urlencode({'url': url})
print 'quote()     :', urllib.quote(url)
print 'quote_plus():', urllib.quote_plus(url)
Unquoting
To reverse the quoting, use the corresponding unquote() or unquote_plus() functions.
# -*- coding: UTF-8 -*-

import urllib

print urllib.unquote('http%3A//localhost%3A8080/%7Edhellmann/')
print urllib.unquote_plus('http%3A%2F%2Flocalhost%3A8080%2F%7Edhellmann%2F')
Paths vs. URLs
Some operating systems use different values to separate the parts of paths in local files and in URLs. To keep code portable, use the functions pathname2url() and url2pathname() to convert back and forth.
# -*- coding: UTF-8 -*-

from urllib import pathname2url, url2pathname

print '== Default =='
path = '/a/b/c'
print 'Original:', path
print 'URL     :', pathname2url(path)
print 'Path    :', url2pathname('/d/e/f')
print

# The Windows-style conversions live in nturl2path; importing from
# there makes the examples below behave the same on any platform.
from nturl2path import pathname2url, url2pathname

print '== Windows, without drive letter =='
path = r'\a\b\c'
print 'Original:', path
print 'URL     :', pathname2url(path)
print 'Path    :', url2pathname('/d/e/f')
print

print '== Windows, with drive letter =='
path = r'C:\a\b\c'
print 'Original:', path
print 'URL     :', pathname2url(path)
print 'Path    :', url2pathname('/d/e/f')
print
0x04 urllib2 -- network resource access
Purpose: a library for opening URLs, extensible by defining custom protocol handlers.
The urllib2 module provides an updated API for working with Internet resources identified by URLs.
HTTP GET
(The example for this section raised an error when originally run.)
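In its place, here is a minimal sketch of an HTTP GET with urllib2, assuming one of the BaseHTTPServer examples from section 0x02 is listening on localhost:8080:

# -*- coding: UTF-8 -*-

import urllib2

# Assumes a server from section 0x02 is running on localhost:8080.
response = urllib2.urlopen('http://localhost:8080/')
print 'RESPONSE:', response
print 'URL     :', response.geturl()

headers = response.info()
print 'HEADERS :'
print '---------'
print headers

data = response.read()
print 'LENGTH  :', len(data)
print 'DATA    :'
print '---------'
print data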
0x05 base64 -- encoding binary data with ASCII
Base64 Encoding
# -*- coding: UTF-8 -*-

import base64

# Load this source file and strip the header.
with open(__file__, 'rt') as input:
    raw = input.read()
    initial_data = raw.split('#end_pymotw_headers')[1]

encoded_data = base64.b64encode(initial_data)

num_initial = len(initial_data)

# There will never be more than 2 padding bytes.
padding = (3 - (num_initial % 3)) % 3

print '%d bytes before encoding' % num_initial
print 'Expect %d padding bytes' % padding
print '%d bytes after encoding' % len(encoded_data)
print
print encoded_data
Base64 Decoding
# -*- coding: UTF-8 -*-

import base64

original_string = 'this is the data, in the clear.'
print 'Original:', original_string

encoded_string = base64.b64encode(original_string)
print 'Encoded :', encoded_string

decoded_string = base64.b64decode(encoded_string)
print 'Decoded :', decoded_string
URL-Safe Variations
Because the default Base64 alphabet may use + and /, and both of those characters appear in URLs, it is often necessary to use an alternate encoding that substitutes for them: + is replaced with a hyphen (-), and / with an underscore (_).
# -*- coding: UTF-8 -*-

import base64

encodes_with_pluses = chr(251) + chr(239)
encodes_with_slashes = chr(255) * 2

for original in [encodes_with_pluses, encodes_with_slashes]:
    print 'Original         :', repr(original)
    print 'Standard encoding:', base64.standard_b64encode(original)
    print 'URL-safe encoding:', base64.urlsafe_b64encode(original)
    print
Other Encodings
# -*- coding: UTF-8 -*-

import base64

original_string = 'This is the data, in the clear.'
print 'Original:', original_string

# The Base32 alphabet includes the 26 uppercase ASCII letters
# and the digits 2-7.
encoded_string = base64.b32encode(original_string)
print 'Base32 Encoded:', encoded_string
decoded_string = base64.b32decode(encoded_string)
print 'Base32 Decoded:', decoded_string

# The Base16 functions work with the hexadecimal alphabet.
encoded_string = base64.b16encode(original_string)
print 'Base16 Encoded:', encoded_string
decoded_string = base64.b16decode(encoded_string)
print 'Base16 Decoded:', decoded_string
0x06 robotparser -- access control for web spiders
Purpose: parse the robots.txt files used to control web spiders.
# -*- coding: UTF-8 -*-

import robotparser
import urlparse

AGENT_NAME = 'PyMOTW'
URL_BASE = 'http://www.doughellmann.com/'
parser = robotparser.RobotFileParser()
parser.set_url(urlparse.urljoin(URL_BASE, 'robots.txt'))
parser.read()

PATHS = [
    '/',
    '/PyMOTW/',
    '/admin/',
    '/downloads/PyMOTW-1.92.tar.gz',
]

for path in PATHS:
    print '%6s : %s' % (parser.can_fetch(AGENT_NAME, path), path)
    url = urlparse.urljoin(URL_BASE, path)
    print '%6s : %s' % (parser.can_fetch(AGENT_NAME, url), url)
    print
The URL argument to can_fetch() may be a path relative to the root of the site, or a full URL.
Long-Lived Spiders
An application that takes a long time to process the resources it downloads, or that is throttled and must pause between downloads, should check for a new robots.txt file periodically, based on the age of the content it has already downloaded. The age is not managed automatically, but the module provides convenience methods that make tracking it easier.
# -*- coding: UTF-8 -*-

import robotparser
import urlparse
import time

AGENT_NAME = 'PyMOTW'
URL_BASE = 'http://www.doughellmann.com/'
parser = robotparser.RobotFileParser()
parser.set_url(urlparse.urljoin(URL_BASE, 'robots.txt'))
parser.read()
parser.modified()

PATHS = [
    '/',
    '/PyMOTW/',
    '/admin/',
    '/downloads/PyMOTW-1.92.tar.gz',
]

for path in PATHS:
    age = int(time.time() - parser.mtime())
    print 'age:', age,
    if age > 1:
        print 'rereading robots.txt'
        parser.read()
        parser.modified()
    else:
        print
    print '%6s : %s' % (parser.can_fetch(AGENT_NAME, path), path)
    # Simulate delay in processing
    time.sleep(1)
    print
This somewhat extreme example downloads a new robots.txt file whenever the one it has is more than one second old. A better-behaved long-lived application would request the modification time of the file before downloading the whole thing.
0x07 Cookie -- HTTP cookies
Creating and Setting a Cookie
# -*- coding: UTF-8 -*-

import Cookie

c = Cookie.SimpleCookie()
c['name'] = 'p0pl4r'
print c
The output is a valid Set-Cookie header, ready to be passed to the client as part of the HTTP response.
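Going the other direction, the value of a Cookie header received from a client can be parsed back with load(); a minimal sketch:

# -*- coding: UTF-8 -*-

import Cookie

c = Cookie.SimpleCookie()
# load() accepts the string value of a Cookie header sent by a client.
c.load('name=p0pl4r')
print c['name'].value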
Morsel
All of a cookie's RFC attributes, such as expiration, path, and domain, can be managed through the Morsel object representing the cookie value.
# -*- coding: UTF-8 -*-

import Cookie
import datetime

def show_cookie(c):
    print c
    for key, morsel in c.iteritems():
        print
        print 'key =', morsel.key
        print 'value =', morsel.value
        print 'coded_value =', morsel.coded_value
        for name in morsel.keys():
            if morsel[name]:
                print '%s = %s' % (name, morsel[name])

c = Cookie.SimpleCookie()

# A cookie with a value that has to be encoded to fit into the headers
c['encoded_value_cookie'] = '"cookie_value"'
c['encoded_value_cookie']['comment'] = 'this is cookie\'s comment'

# A cookie that only applies to part of a site
c['restricted_cookie'] = 'cookie_value'
c['restricted_cookie']['path'] = '/sub/path'
c['restricted_cookie']['domain'] = 'PyMOTW'
c['restricted_cookie']['secure'] = 'True'

# A cookie that expires in 5 minutes
c['with_max_age'] = 'expires in 5 minutes'
c['with_max_age']['max-age'] = 300  # seconds

# A cookie that expires at a specific time
c['expires_at_time'] = 'cookie_value'
time_to_live = datetime.timedelta(hours=1)
expires = datetime.datetime(2018, 9, 19, 18, 30, 14) + time_to_live

# Date format: Wdy, DD Mon YYYY HH:MM:SS
expires_at_time = expires.strftime('%a, %d %b %Y %H:%M:%S')
c['expires_at_time']['expires'] = expires_at_time

show_cookie(c)