Python BeautifulSoup：获取页面中多个子节点下各个节点的内容
页面的 HTML 格式如下：
<tr bgcolor="#7bb5de">
<td style="border-bottom: 1px solid #C9D8AD" width="118" align="center" bgcolor="#D9E6FF">
<p align="center">
lyl5577d92</p></td>
<td style="border-bottom: 1px solid #C9D8AD" width="96" align="center" bgcolor="#D9E6FF">
<p align="center">李永利</p></td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="105" bgcolor="#D9E6FF">
<div align="center"><font color="#FF0000">lyl5577d</font></div>
</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="153" bgcolor="#D9E6FF">
<div align="center">469680008</div>
</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="218" bgcolor="#D9E6FF">
<div align="center">2016-05-21 15:24:27.0</div>
</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="171" bgcolor="#D9E6FF">
<div align="center">0</div>
</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="119" bgcolor="#D9E6FF">0</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="111" bgcolor="#D9E6FF">0</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="87" bgcolor="#D9E6FF">0</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="128" bgcolor="#D9E6FF">0</td>
</tr>
1 import httplib 2 from BeautifulSoup import BeautifulSoup 3 4 5 def main(): 6 f = open('result','a') 7 8 headers = {'Content-Type':'application/x-www-form-urlencoded', 9 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 10 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 11 'Accept-Encoding': 'gzip, deflate', 12 'Referer': 'http://xxx.xxx.com/admin/userlist', 13 'Cookie': 'JSESSIONID=9F6F2D03D2C11400B3D6731E90D73117', 14 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:46.0) Gecko/20100101 Firefox/46.0', 15 } 16 17 conn = httplib.HTTPConnection('*.*.*.*', timeout=50) 18 19 for p in range(1,1287): 20 print p 21 conn.request(method='GET', 22 url="/admin/userlist?toPage=%s&sessionID=" % str(p), 23 headers=headers) 24 resp = conn.getresponse() 25 html_doc = resp.read() 26 mainSoup = BeautifulSoup(html_doc) 27 for s in mainSoup.findAll('tr', attrs={'bgcolor':'#7bb5de'}): 28 if 'style' not in str(s): 29 continue 30 for d in s.findAll('td'): 31 print d.getText(), 32 f.write("%s " % d.getText().encode('utf-8')) #f.write("%s " % d.getText())==> UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-2: ordinal not in range(128) 33 f.write("%s\n" % d.getText().encode('utf-8')) 34 print 35 f.close() 36 conn.close() 37 38 39 if __name__ == '__main__': 40 main() 41