Beautifulsoup4学习文档解析流程图 - kmingspirit

公告

url = http://zst.aicai.com/ssq/openInfo/

体育彩票开奖信息：一种思路是用正则处理Html，另一种是用类似xml解析框架的方式解析html。两种方法没有绝对的优劣，不能说哪个方便、哪个代码少就是容易。有精力还是要打好正则的扎实基础才好。

import urllib.request
import urllib.parse
import re
import urllib.request,urllib.parse,http.cookiejar

def getHtml(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    A cookie-aware opener carrying a browser-like ``User-Agent`` header and a
    fixed ``Cookie`` header is built and installed as the process-wide default
    opener before the request is made.

    Args:
        url: Address to fetch; handed to ``urllib.request.urlopen``.

    Returns:
        The response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [
        ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
        ('Cookie', '4564564564564564565646540'),
    ]

    # Installing the opener is a global side effect: every later
    # urllib.request.urlopen() call in this process goes through it.
    urllib.request.install_opener(opener)

    # Close the response as soon as the body is read instead of leaving the
    # underlying connection open until garbage collection.
    with urllib.request.urlopen(url) as response:
        html_bytes = response.read()
    return html_bytes.decode('utf-8')

# Source page: http://zst.aicai.com/ssq/openInfo/
# Expected final output, e.g.: 2015075期开奖号码：6,11,13,19,21,32, 蓝球：4
html = getHtml("http://zst.aicai.com/ssq/openInfo/")

# The results are inside <table class="fzTab nbt"> ... </table>.
table_start = html.find('<table class="fzTab nbt">')
# Search for the closing tag only from the opening tag onwards; searching from
# the start of the document would match an earlier table's </table> and
# produce an empty slice.
table = html[table_start : html.find('</table>', table_start)]

# In the served markup a data row opens as
#   <tr \r\n\t\t                  onmouseout="this.style.background=''" onmouseover=...>
# so split once on that exact prefix; everything after it starts at the first
# such row.
tmp = table.split('<tr \r\n\t\t                  onmouseout=', 1)
trs = tmp[1]
tr = trs[: trs.find('</tr>')]

# The text of the first '<td   >' cell is printed as the draw number.
number = tr.split('<td   >')[1].split('</td>')[0]
print(number + '期开奖号码：', end='')

redtmp = tr.split('<td  class="redColor sz12" >')
# Drop the first and the last piece of the split; per the original author's
# note they are not needed.
reds = redtmp[1:-1]
for redstr in reds:
    print(redstr.split('</td>')[0] + ",", end='')

print('蓝球：', end='')
blue = tr.split('<td  class="blueColor sz12" >')[1].split('</td>')[0]
print(blue)

View Code

from bs4 import BeautifulSoup

import urllib.request
import urllib.parse
import urllib.request,http.cookiejar

def getHtml(url):
    """Download ``url`` and return its body as UTF-8 decoded text.

    Builds an opener with cookie handling plus a browser-like ``User-Agent``
    and a fixed ``Cookie`` header, installs it as the default opener for the
    process, then performs the request.

    Args:
        url: Address to fetch; handed to ``urllib.request.urlopen``.

    Returns:
        The response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
                         ('Cookie', '4564564564564564565646540')]

    # Global side effect: subsequent urllib.request.urlopen() calls in this
    # process use this opener.
    urllib.request.install_opener(opener)

    # Use the response as a context manager so it is closed right after the
    # body is read rather than whenever it happens to be garbage collected.
    with urllib.request.urlopen(url) as response:
        html_bytes = response.read()
    html_string = html_bytes.decode('utf-8')
    return html_string

# Download the results page and parse it with BeautifulSoup's 'html.parser'.
html_doc = getHtml("http://zst.aicai.com/ssq/openInfo/")
soup = BeautifulSoup(html_doc, 'html.parser')

# Original author's note: going through
# soup.find_all('table', class_='fzTab') reportedly loses the
# <tr onmouseout="this.style.background=''"> rows, so the row is looked up
# directly by its onmouseout attribute instead.
row_attrs = {"onmouseout": "this.style.background=''"}
tr = soup.find('tr', attrs=row_attrs)
tds = tr.find_all('td')

# Cell 0 is printed as the draw number, cells 2-7 as the red balls and
# cell 8 as the blue ball.
opennum = tds[0].get_text()
reds = [tds[i].get_text() for i in range(2, 8)]
blue = tds[8].get_text()

# ','.join(list) turns the list of red balls into one string.
# Expected final output, e.g.: 2015075期开奖号码：6,11,13,19,21,32, 蓝球：4
red_text = ','.join(reds)
print(opennum + '期开奖号码：' + red_text + ", 蓝球：" + blue)

urllib.parse

　顾名思义，URL parsing，即 url 解析的意思。

URL解析功能集中在将URL字符串分割到其组件中，或将URL组件合并到URL字符串中。

1 urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

from bs4 import BeautifulSoup

import urllib.request
import urllib.parse
import urllib.request,http.cookiejar
import requests
url = "http://zst.aicai.com/ssq/openInfo/"
# Without a timeout requests waits indefinitely for an unresponsive server;
# bound the wait so the script fails instead of hanging.
res = requests.get(url, timeout=10)
# Force UTF-8 when decoding res.text.
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# Original author's note: going through
# soup.find_all('table', class_='fzTab') reportedly loses the
# <tr onmouseout="this.style.background=''"> rows, so the row is looked up
# directly by its onmouseout attribute instead.
tr = soup.find('tr', attrs={"onmouseout": "this.style.background=''"})
tds = tr.find_all('td')

# Cell 0 is printed as the draw number, cells 2-7 as the red balls and
# cell 8 as the blue ball.
opennum = tds[0].get_text()
reds = [tds[i].get_text() for i in range(2, 8)]
blue = tds[8].get_text()

# ','.join(list) turns the list of red balls into one string.
# Expected final output, e.g.: 2015075期开奖号码：6,11,13,19,21,32, 蓝球：4
print(opennum + '期开奖号码：' + ','.join(reds) + ", 蓝球：" + blue)

http://docs.python-requests.org/zh_CN/latest/user/quickstart.html

我感觉写getHtml()方法的人是java出身-。- , 也可以用上述写法，不用函数方法。

 1 __author__ = 'Kming'
 2 
 3 from bs4 import BeautifulSoup
 4 import requests
 5 url = "http://www.baidu.com"
 6 res = requests.get(url)
 7 res.encoding='utf-8'
 8 soup = BeautifulSoup(res.text, 'html.parser')
 9 
10 for link in soup.find_all('a'):
11     print(link.get('href'))

posted on 2017-08-02 19:10 kmingspirit 阅读(899) 评论(0) 编辑收藏举报

刷新页面返回顶部