Internet

 

 

0x01 Parsing / Unparsing / Joining URLs

Parsing

urlparse()--split a URL into its components

# -*- coding: UTF-8 -*-

from urlparse import urlparse

url = 'http://user:pwd@NetLoc:80/p1;param/p2?query=arg#frag'
parsed = urlparse(url)
print parsed

print parsed.scheme
print parsed.netloc
print parsed.path
print parsed.params
print parsed.query
print parsed.fragment
print parsed.username
print parsed.password
print parsed.hostname,'(netloc in lowercase)'
print parsed.port

 

urlsplit()--an alternative to urlparse() that does not split the parameters from the path (the result has no params attribute)

# -*- coding: UTF-8 -*-

from urlparse import urlsplit

url = 'http://user:pwd@NetLoc:80/p1;param/p2?query=arg#frag'
parsed = urlsplit(url)
print parsed

print parsed.scheme
print parsed.netloc
print parsed.path
# note: no parsed.params here -- SplitResult does not separate the parameters
print parsed.query
print parsed.fragment
print parsed.username
print parsed.password
print parsed.hostname,'(netloc in lowercase)'
print parsed.port
urlsplit() example

 

urldefrag()--strip the fragment identifier from a URL

# -*- coding: UTF-8 -*-

from urlparse import urldefrag

url = 'http://NetLoc/path;param?query=arg#frag'

print 'original :',url
url,fragment = urldefrag(url)
print 'url :',url
print 'fragment :',fragment
urldefrag() example

 

Unparsing

geturl()--only works on the object returned by urlparse() or urlsplit()

# -*- coding: UTF-8 -*-

from urlparse import urlparse

url = 'http://NetLoc/path;param?query=arg#frag'

print 'original :',url
parsed = urlparse(url)

print 'after geturl() :',parsed.geturl()
geturl() example

 

 

urlunparse()--assemble a plain tuple of strings back into a URL (if the input URL included superfluous parts, the reconstructed URL may drop them; the sketch after the example below demonstrates this)

# -*- coding: UTF-8 -*-

from urlparse import urlparse,urlunparse

url = 'http://NetLoc/path;param?query=arg#frag'

print 'ORIGINAL URL:',url
parsed = urlparse(url)
print 'PARSED :',type(parsed),parsed
t = parsed[:]
print 'TUPLE :',type(t),t
print 'NEW :',urlunparse(t)
urlunparse() example
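
The example above round-trips a fully populated URL. To see the dropping behavior just mentioned, here is a brief sketch (not from the original post) using a URL whose extra delimiters mark empty components:

# -*- coding: UTF-8 -*-

from urlparse import urlparse,urlunparse

# The trailing ';?#' delimiters carry empty params/query/fragment components.
url = 'http://NetLoc/path;?#'
parsed = urlparse(url)
print 'ORIGINAL:',url
print 'NEW     :',urlunparse(parsed)

The trailing ';?#' disappears because urlunparse() only emits a delimiter when the corresponding component is non-empty.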

 

 

Joining

urljoin()--construct an absolute URL from relative fragments

# -*- coding: UTF-8 -*-

from urlparse import urljoin

print urljoin('http://www.example.com/path/file.html','anotherfile.html')
print urljoin('http://www.example.com/path/file.html','../anotherfile.html')
urljoin() example with relative paths

# -*- coding: UTF-8 -*-

from urlparse import urljoin

print urljoin('http://www.example.com/path/','/subpath/file.html')
print urljoin('http://www.example.com/path/','subpath/file.html')
urljoin() example with non-relative paths

Note: if the path being joined to the URL starts with a slash (/), the URL's path is reset to the top level. If it does not start with a slash, it is appended to the end of the current URL's path.

 

 

0x02  BaseHTTPServer--Base Classes for Implementing Web Servers

 HTTP GET

The following example shows how a request handler can return a response to the client.

# -*- coding: UTF-8 -*-

from BaseHTTPServer import BaseHTTPRequestHandler
import urlparse

class GetHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        parsed_path = urlparse.urlparse(self.path)
        message_parts = [
                'CLIENT VALUES:',
                'client_address=%s (%s)' % (self.client_address,
                                            self.address_string()),
                'command=%s' % self.command,
                'path=%s' % self.path,
                'real_path=%s' % parsed_path.path,
                'query=%s' % parsed_path.query,
                'request_version=%s' % self.request_version,
                '',
                'SERVER VALUES:',
                'server_version=%s' % self.server_version,
                'sys_version=%s' % self.sys_version,
                'protocol_version=%s' % self.protocol_version,
                '',
                'HEADERS RECEIVED:',
            ]
        for name,value in sorted(self.headers.items()):
            message_parts.append('%s=%s' % (name,value.rstrip()))
        message_parts.append('')
        message = '\r\n'.join(message_parts)
        self.send_response(200)
        self.end_headers()
        self.wfile.write(message)
        return

if __name__ == '__main__':
    from BaseHTTPServer import HTTPServer
    server = HTTPServer(('localhost',8080),GetHandler)
    print 'Starting server, use <Ctrl+C> to stop'
    server.serve_forever()

 

HTTP POST

Supporting POST requests takes a little more work, because the base class does not parse form data automatically. The cgi module provides the FieldStorage class, which knows how to parse a form, given the right inputs.

# -*- coding: UTF-8 -*-

from BaseHTTPServer import BaseHTTPRequestHandler
import cgi


class PostHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        # Parse the form data posted
        form = cgi.FieldStorage(
            fp=self.rfile,
            headers=self.headers,
            environ={'REQUEST_METHOD': 'POST',
                     'CONTENT_TYPE': self.headers['Content-Type'],
                     })

        # Begin the response
        self.send_response(200)
        self.end_headers()
        self.wfile.write('Client:%s\n' % str(self.client_address))
        self.wfile.write('User-agent:%s\n' % str(self.headers['user-agent']))
        self.wfile.write('Path:%s\n' % self.path)
        self.wfile.write('Form data:\n')

        # Echo back information about what was posted in the form
        for field in form.keys():
            field_item = form[field]
            if field_item.filename:
                # The field contains an uploaded file
                file_data = field_item.file.read()
                file_len = len(file_data)
                del file_data
                self.wfile.write(
                    '\tUpload %s as "%s" (%d bytes)\n' % (field, field_item.filename, file_len))
            else:
                # Regular form values
                self.wfile.write('\t%s=%s\n' % (field, form[field].value))

        return


if __name__ == '__main__':
    from BaseHTTPServer import HTTPServer
    server = HTTPServer(('localhost', 8080), PostHandler)
    print 'Starting server, use <Ctrl+C> to stop'
    server.serve_forever()

 

 

Threading and Forking

HTTPServer is a subclass of SocketServer.TCPServer and does not use multiple threads or processes to handle requests. To add threading or forking, create a new class from SocketServer using the appropriate mix-in.

# -*- coding: UTF-8 -*-

from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
from SocketServer import ThreadingMixIn
import threading

class Handler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.end_headers()
        message = threading.currentThread().getName()
        self.wfile.write(message)
        self.wfile.write('\n')
        return

class ThreadedHTTPServer(ThreadingMixIn,HTTPServer):
    """Handle requests in a separate thread."""

if __name__ == '__main__':
    server = ThreadedHTTPServer(('localhost',8080),Handler)
    print 'Starting server, use <Ctrl+C> to stop'
    server.serve_forever()

 

Handling Errors

# -*- coding: UTF-8 -*-

from BaseHTTPServer import BaseHTTPRequestHandler

class ErrorHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_error(404)
        return

if __name__ == '__main__':
    from BaseHTTPServer import HTTPServer
    server = HTTPServer(('localhost',8080),ErrorHandler)
    print 'Starting server, use <Ctrl+C> to stop'
    server.serve_forever()

 

Setting Headers

The send_header() method adds header data to the HTTP response.

# -*- coding: UTF-8 -*-

from BaseHTTPServer import BaseHTTPRequestHandler
import time

class GetHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header('Last-Modified',
                          self.date_time_string(time.time()))
        self.end_headers()
        self.wfile.write('Response body \n')
        return

if __name__ == '__main__':
    from BaseHTTPServer import HTTPServer
    server = HTTPServer(('localhost',8080),GetHandler)
    print 'Starting server, use <Ctrl+C> to stop'
    server.serve_forever()
send_header()

 

 

0x03  urllib--Network Resource Access

Purpose: access remote resources that do not require authentication, handle cookies, and so on.

Simple Retrieval with a Cache

The urlretrieve() function provided by urllib downloads data. Its arguments are the URL, a temporary file to hold the data, and a function to report download progress. If the URL refers to a form that requires data to be submitted, urlretrieve() also takes a data argument to pass along. The calling program can delete the file directly, or treat it as a cache and remove it with urlcleanup().

An example that uses an HTTP GET request to fetch data from a web server:

# -*- coding: UTF-8 -*-

import urllib
import os

def reporthook(blocks_read, block_size, total_size):
    """total_size is reported in bytes,
    block_size is the amount read each time.
    blocks_read is the number of blocks successfully read.
    """
    if not blocks_read:
        print 'Connection opened'
        return
    if total_size < 0:
        # Unknown size
        print 'Read %d blocks (%d bytes)' % (blocks_read, blocks_read * block_size)
    else:
        amount_read = blocks_read * block_size
        print 'Read %d blocks, or %d/%d' % (blocks_read, amount_read, total_size)
    return

try:
    filename, msg = urllib.urlretrieve('http://blog.doughellmann.com/', reporthook=reporthook)

    print
    print 'File:', filename
    print 'Headers:'
    print msg
    print 'File exists before cleanup:', os.path.exists(filename)

finally:
    urllib.urlcleanup()
    print 'File still exists:', os.path.exists(filename)

 

Encoding Arguments

Arguments can be encoded with urlencode() and appended to the URL to pass them to the server. (Note: the urlopen() call below fails unless a server, such as the GET handler from section 0x02, is listening on localhost:8080.)

# -*- coding: UTF-8 -*-

import urllib

query_args = {'q':'query string','foo':'bar'}
encoded_args = urllib.urlencode(query_args)
print 'Encoded:', encoded_args

url = 'http://localhost:8080/?' + encoded_args
print urllib.urlopen(url).read()

To pass a sequence of values using separate occurrences of the variable in the query string, set doseq to True when calling urlencode().

# -*- coding: UTF-8 -*-

import urllib

query_args = {'foo':['foo1','foo2']}
print 'Single  :', urllib.urlencode(query_args)
print 'Sequence:', urllib.urlencode(query_args, doseq=True)

The result is a query string with several values associated with the same name.

Query arguments may include special characters that cause problems while the URL is being parsed on the server, so they are "quoted" when passed to urlencode(). To quote special characters locally and make "safe" versions of a string outside of urlencode(), use the quote() or quote_plus() functions directly.

# -*- coding: UTF-8 -*-

import urllib

url = 'http://localhost:8080/~dhellmann/'
print 'urlencode() :', urllib.urlencode({'url':url})
print 'quote()     :', urllib.quote(url)
print 'quote_plus():', urllib.quote_plus(url)

Reversing the quote operation

Use the corresponding unquote() or unquote_plus() functions.

# -*- coding: UTF-8 -*-

import urllib

print urllib.unquote('http%3A//localhost%3A8080/%7Edhellmann/')
print urllib.unquote_plus('http%3A%2F%2Flocalhost%3A8080%2F%7Edhellmann%2F')

 

Paths and URLs

Some operating systems use different values to separate the parts of paths in local files and in URLs. To keep code portable, use the functions pathname2url() and url2pathname() to convert back and forth.

# -*- coding: UTF-8 -*-

import os
from urllib import pathname2url, url2pathname

# Note: these functions are platform-specific; the Windows-style
# conversions below only take effect when running on Windows.
print '== Default =='
path = '/a/b/c'
print 'Original:', path
print 'URL:', pathname2url(path)
print 'Path:', url2pathname('/d/e/f')
print

print '== Windows, without drive letter =='
path = r'\a\b\c'
print 'Original:', path
print 'URL:', pathname2url(path)
print 'Path:', url2pathname('/d/e/f')
print

print '== Windows, with drive letter =='
path = r'C:\a\b\c'
print 'Original:', path
print 'URL:', pathname2url(path)
print 'Path:', url2pathname('/d/e/f')
print

 

0x04  urllib2--Network Resource Access

Purpose: a library for opening URLs, which can be extended by defining custom protocol handlers.

The urllib2 module provides an updated API for working with Internet resources identified by URLs.

HTTP GET

(The author's original example is missing here; it errored when run.)
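
Since the original code is lost, the following is a minimal sketch of an HTTP GET with urllib2 (not the author's code); it assumes the GetHandler server from section 0x02 is listening on localhost:8080:

# -*- coding: UTF-8 -*-

import urllib2

# Assumes the BaseHTTPServer GET example above is running on localhost:8080.
response = urllib2.urlopen('http://localhost:8080/')
print 'RESPONSE:', response
print 'URL     :', response.geturl()

headers = response.info()
print 'DATE    :', headers['date']
print 'HEADERS :'
print headers

data = response.read()
print 'LENGTH  :', len(data)
print 'DATA    :'
print data

urlopen() returns a file-like object; its info() method exposes the response headers and geturl() reports the final URL after any redirects.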

 

0x05  base64--Encode Binary Data with ASCII

Base64 Encoding

# -*- coding: UTF-8 -*-

import base64

# Load this source file and keep everything after the marker string.
with open(__file__,'rt') as input:
    raw = input.read()
    initial_data = raw.split('#end_pymotw_headers')[1]

encoded_data = base64.b64encode(initial_data)

num_initial = len(initial_data)

# There will never be more than 2 padding bytes.
padding = (3 - (num_initial % 3)) % 3

print '%d bytes before encoding' % num_initial
print 'Expect %d padding bytes' % padding
print '%d bytes after encoding' % len(encoded_data)
print
print encoded_data

 

Base64 Decoding

# -*- coding: UTF-8 -*-

import base64

original_string = 'this is the data, in the clear.'
print 'Original:', original_string
encoded_string = base64.b64encode(original_string)

print 'Encoded:', encoded_string

decoded_string = base64.b64decode(encoded_string)
print 'Decoded:', decoded_string

 

URL-Safe Variations

Because the default Base64 alphabet uses + and /, and those two characters appear in URLs, it is often necessary to use an alternate encoding that substitutes them: + is replaced with - and / with the underscore _.

# -*- coding: UTF-8 -*-

import base64

encodes_with_pluses = chr(251) + chr(239)
encodes_with_slashes = chr(255) * 2

for original in [encodes_with_pluses, encodes_with_slashes]:
    print 'Original          :', repr(original)
    print 'Standard encoding :', base64.standard_b64encode(original)
    print 'URL-safe encoding :', base64.urlsafe_b64encode(original)
    print

 

Other Encodings

# -*- coding: UTF-8 -*-

import base64

original_string = 'This is the data,in the clear.'
print 'Original:', original_string

# The Base32 alphabet includes the 26 uppercase ASCII letters and the digits 2-7.
encoded_string = base64.b32encode(original_string)
print 'Base32Encoded :', encoded_string
decoded_string = base64.b32decode(encoded_string)
print 'Base32Decoded :', decoded_string

# The Base16 functions work with the hexadecimal alphabet.
encoded_string = base64.b16encode(original_string)
print 'Base16Encoded :', encoded_string
decoded_string = base64.b16decode(encoded_string)
print 'Base16Decoded :', decoded_string

 

0x06  robotparser--Access Control for Web Spiders

Purpose: parse the robots.txt file used to control web spiders.

# -*- coding: UTF-8 -*-

import robotparser
import urlparse

AGENT_NAME = 'PyMOTW'
URL_BASE = 'http://www.doughellmann.com/'
parser = robotparser.RobotFileParser()
parser.set_url(urlparse.urljoin(URL_BASE,'robots.txt'))
parser.read()

PATHS = [
    '/',
    '/PyMOTW/',
    '/admin/',
    '/downloads/PyMOTW-1.92.tar.gz',
    ]

for path in PATHS:
    print '%6s : %s' % (parser.can_fetch(AGENT_NAME,path),path)
    url = urlparse.urljoin(URL_BASE,path)
    print '%6s : %s' % (parser.can_fetch(AGENT_NAME,url),url)
    print

The URL argument to can_fetch() can be a path relative to the root of the site or a complete URL.

 

Long-Lived Spiders

An application that takes a long time to process the resources it downloads, or that is throttled and needs to pause between downloads, should check for new robots.txt files periodically, based on the age of the content it has already downloaded. The age is not managed automatically, but the module provides convenience methods that make tracking it easier.

# -*- coding: UTF-8 -*-

import robotparser
import urlparse
import time

AGENT_NAME = 'PyMOTW'
URL_BASE = 'http://www.doughellmann.com/'
parser = robotparser.RobotFileParser()
parser.set_url(urlparse.urljoin(URL_BASE,'robots.txt'))
parser.read()
parser.modified()

PATHS = [
    '/',
    '/PyMOTW/',
    '/admin/',
    '/downloads/PyMOTW-1.92.tar.gz',
    ]

for path in PATHS:
    age = int(time.time() - parser.mtime())
    print 'age:', age,
    if age > 1:
        print 'rereading robots.txt'
        parser.read()
        parser.modified()
    else:
        print
    print '%6s : %s' % (parser.can_fetch(AGENT_NAME,path),path)
    # Simulate delay in processing
    time.sleep(1)
    print

This extreme example downloads a new robots.txt file if the one it has is more than one second old. A better-behaved long-lived application might request the modification time of the file before downloading the entire thing, as sketched below.
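
One way to do that check, sketched here as an illustration (the HeadRequest helper is hypothetical, not part of robotparser or urllib2): issue a HEAD request and compare the server's Last-Modified header against the copy on hand before re-reading the whole file.

# -*- coding: UTF-8 -*-

import urllib2

class HeadRequest(urllib2.Request):
    """Illustrative Request subclass that issues HEAD instead of GET."""
    def get_method(self):
        return 'HEAD'

# Fetch only the headers of robots.txt; if Last-Modified is newer than
# the copy on hand, call parser.read() again, otherwise skip the download.
response = urllib2.urlopen(HeadRequest('http://www.doughellmann.com/robots.txt'))
print response.info().getheader('Last-Modified')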

 

0x07  Cookie--HTTP Cookies

Creating and Setting a Cookie

# -*- coding: UTF-8 -*-

import Cookie

c = Cookie.SimpleCookie()
c['name'] = 'p0pl4r'
print c

The output is a valid Set-Cookie header, ready to be passed to the client as part of the HTTP response.
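
SimpleCookie also works in the other direction; a minimal sketch (not from the original post) that parses a Cookie header string received from a client:

# -*- coding: UTF-8 -*-

import Cookie

# load() parses a header string, such as the value of an incoming Cookie header.
c = Cookie.SimpleCookie()
c.load('name=p0pl4r; integer=5')
for key, morsel in c.iteritems():
    print key, '=', morsel.value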

 

Morsel

All of the RFC attributes of a cookie, such as expiration, path, and domain, can be managed through the Morsel object representing the cookie value.

# -*- coding: UTF-8 -*-

import Cookie
import datetime

def show_cookie(c):
    print c
    for key,morsel in c.iteritems():
        print
        print 'key=',morsel.key
        print 'value=',morsel.value
        print 'coded_value=',morsel.coded_value
        for name in morsel.keys():
            if morsel[name]:
                print '%s = %s' % (name,morsel[name])

c = Cookie.SimpleCookie()

# A cookie with a value that has to be encoded to fit into the headers
c['encoded_value_cookie'] = '"cookie_value"'
c['encoded_value_cookie']['comment'] = 'this is cookie\'s comment'

# A cookie that only applies to part of a site
c['restricted_cookie'] = 'cookie_value'
c['restricted_cookie']['path'] = '/sub/path'
c['restricted_cookie']['domain'] = 'PyMOTW'
c['restricted_cookie']['secure'] = 'True'

# A cookie that expires in 5 minutes
c['with_max_age'] = 'expires in 5 minutes'
c['with_max_age']['max-age'] = 300 # seconds

# A cookie that expires at a specific time
c['expires_at_time'] = 'cookie_value'
time_to_live = datetime.timedelta(hours = 1)
expires = datetime.datetime(2018,9,19,18,30,14) + time_to_live

# Date format: Wdy, DD-Mon-YY HH:MM:SS GMT
expires_at_time = expires.strftime('%a, %d %b %Y %H:%M:%S')
c['expires_at_time']['expires'] = expires_at_time
show_cookie(c)

 
