如何解析复杂页面IP地址?

import re

import lxml.html

# Sample page passed to analysis_content(): an obfuscated listing whose caption
# announces ten IP addresses, laid out as <br>-separated groups of fragments.
# Visible fragments are bare text, style="display:inline" spans, or spans whose
# class has no display:none rule in the <style> block; decoy fragments are
# hidden with an inline style="display:none" or with a class that the <style>
# block sets to display:none (HRAR and QMMO here).
# NOTE(review): the literal ends with "<body>" rather than "</body>" and has
# trailing spaces before the closing quotes - looks unintentional; confirm.
test_data = """
<head><style>
.HRAR{display:none}
.QMMO{display:none}
.DYWL{display:inline}
.KZGR{display:inline}
</style></head>
<body>抓取下面10个ip地址<br>
<span style="display:none">128</span>
<div style="display:none">54</div>
<span style="display:none">38</span>
<span class="QMMO">220</span>
<span style="display:none">.</span>
<span class="QMMO">107</span>
12
<span style="display:none">.</span>
<div style="display:none">99</div>
<span style="display:none">75</span>
<div style="display:none">.</div>
<span style="display:none">79</span>
<span class="QMMO">.</span>
<span style="display:none">.</span>
.
<span style="display:inline">82</span>
<div style="display:none">196</div>
<span style="display:inline">.</span>
<span style="display:none">74</span>
<span class="QMMO">179</span>
141
<span style="display:none">.</span>
.
<span style="display:none">.</span>
<span style="display:none">180</span>
<div style="display:none">162</div>
<span class="NUMH">45</span>
<br>
<div style="display:none">196</div>
<span class="HRAR">.</span>
<div style="display:none">119</div>
<div style="display:none">157</div>
<span class="QMMO">188</span>
<span class="HRAR">222</span>
<span class="HRAR">.</span>
<span class="QMMO">37</span>
<div style="display:none">.</div>
<span class="NUMH">165</span>
<span style="display:none">25</span>
<span class="HRAR">79</span>
<div style="display:none">154</div>
<span style="display:none">.</span>
<div style="display:none">11</div>
<span class="HRAR">61</span>
.
<span class="EIQR">239</span>
<div style="display:none">102</div>
<span style="display:none">.</span>
<span style="display:none">.</span>
<div style="display:none">41</div>
<div style="display:none">193</div>
<span style="display:inline">.</span>
233
.
110
<br>
<span class="QMMO">.</span>
<span style="display:none">3</span>
<span style="display:none">.</span>
140
<span class="QMMO">127</span>
.
<span style="display:none">.</span>
<div style="display:none">202</div>
<span class="DYWL">7</span>
<span style="display:none">148</span>
<span class="HRAR">219</span>
<div style="display:none">.</div>
<span class="QMMO">.</span>
<div style="display:none">.</div>
<div style="display:none">.</div>
<div style="display:none">136</div>
<span class="QMMO">230</span>
<div style="display:none">183</div>
.
<span style="display:none">242</span>
<span style="display:none">.</span>
<span class="QMMO">57</span>
<span style="display:none">.</span>
<span style="display:none">.</span>
<span style="display:inline">190</span>
<span class="EIQR">.</span>
5
<br>
<div style="display:none">.</div>
<span class="HRAR">250</span>
<div style="display:none">179</div>
<div style="display:none">106</div>
<span style="display:none">18</span>
<span class="YMXL">151</span>
<span style="display:none">.</span>
<div style="display:none">73</div>
<span class="HRAR">91</span>
<span class="DYWL">.</span>
<span class="HRAR">201</span>
<span style="display:none">.</span>
<span class="QMMO">.</span>
<span style="display:none">.</span>
<div style="display:none">86</div>
<span style="display:inline">39</span>
<span style="display:none">.</span>
<span style="display:none">.</span>
<span class="HRAR">85</span>
<span class="QMMO">215</span>
<span class="QMMO">.</span>
<span class="HRAR">232</span>
<span class="YMXL">.</span>
<div style="display:none">234</div>
<span style="display:inline">243</span>
<span style="display:inline">.</span>
<span style="display:inline">210</span>
<br>
<span style="display:none">.</span>
<span class="HRAR">.</span>
<span class="QMMO">185</span>
<div style="display:none">119</div>
<span class="HRAR">51</span>
<span class="HRAR">90</span>
<span class="QMMO">229</span>
<span class="BOZY">64</span>
<span style="display:none">256</span>
<span class="HRAR">.</span>
<span class="HRAR">207</span>
<span class="HRAR">99</span>
<span style="display:none">177</span>
<span class="HRAR">161</span>
<div style="display:none">55</div>
<span style="display:none">.</span>
<span class="HRAR">252</span>
<div style="display:none">.</div>
<div style="display:none">106</div>
<span class="HRAR">189</span>
<span class="HRAR">12</span>
.
96
.
<span class="GNTR">36</span>
<span style="display:inline">.</span>
<span class="GNTR">50</span>
<br>
<span style="display:none">211</span>
52
<span style="display:none">158</span>
<span class="HRAR">.</span>
<span class="HRAR">167</span>
<span style="display:none">209</span>
<span style="display:none">57</span>
<span class="HRAR">24</span>
<span style="display:none">.</span>
<span class="QMMO">143</span>
.
<span style="display:none">57</span>
<div style="display:none">.</div>
<span class="HRAR">23</span>
<div style="display:none">.</div>
156
<span style="display:none">29</span>
<span class="GNTR">.</span>
<div style="display:none">80</div>
<span class="QMMO">.</span>
<span style="display:inline">142</span>
<span class="HRAR">.</span>
<div style="display:none">.</div>
<span class="HRAR">248</span>
<span style="display:none">.</span>
.
<span class="DYWL">254</span>
<br>
<span style="display:none">.</span>
<span style="display:none">26</span>
<div style="display:none">164</div>
<div style="display:none">.</div>
<span style="display:none">.</span>
<div style="display:none">102</div>
<span style="display:none">.</span>
<span style="display:none">96</span>
<span class="QGZL">153</span>
<span class="HRAR">229</span>
<span class="QMMO">85</span>
<span style="display:none">130</span>
<div style="display:none">114</div>
.
<span style="display:inline">4</span>
<span style="display:inline">.</span>
<span class="YMXL">162</span>
<span style="display:none">.</span>
<span class="HRAR">232</span>
<div style="display:none">226</div>
<span class="QMMO">.</span>
<span class="HRAR">.</span>
<span style="display:none">142</span>
<div style="display:none">46</div>
.
52
<span class="HRAR">203</span>
<br>
<span style="display:none">.</span>
<span class="QMMO">33</span>
<div style="display:none">29</div>
232
<span class="QMMO">.</span>
<div style="display:none">85</div>
<span class="QMMO">69</span>
<span style="display:none">245</span>
<span class="HRAR">.</span>
<div style="display:none">169</div>
<span style="display:none">199</span>
<span class="HRAR">23</span>
<span style="display:none">.</span>
<span class="QMMO">.</span>
<div style="display:none">88</div>
<span style="display:none">10</span>
<span class="QGZL">.</span>
<span class="QMMO">.</span>
<div style="display:none">.</div>
<div style="display:none">.</div>
<div style="display:none">240</div>
<span style="display:none">245</span>
<span class="YMXL">10</span>
.
<span style="display:inline">72</span>
<span class="BOZY">.</span>
<span class="KZGR">169</span>
<br>
<div style="display:none">206</div>
<div style="display:none">239</div>
<span class="HRAR">218</span>
<div style="display:none">97</div>
<span class="HRAR">106</span>
<span class="QMMO">.</span>
<span class="QMMO">140</span>
<span style="display:none">144</span>
<span class="HRAR">126</span>
<div style="display:none">.</div>
127
<div style="display:none">.</div>
<span style="display:none">120</span>
<span style="display:none">209</span>
<span class="BOZY">.</span>
<span style="display:none">179</span>
<span class="HRAR">.</span>
<span style="display:inline">3</span>
<div style="display:none">.</div>
<span class="QMMO">198</span>
<div style="display:none">169</div>
<span style="display:none">.</span>
<span style="display:none">37</span>
<span class="EIQR">.</span>
<span style="display:inline">31</span>
<span style="display:inline">.</span>
61
<br>
<div style="display:none">37</div>
76
<span style="display:none">94</span>
<span style="display:none">.</span>
.
<span class="HRAR">109</span>
<span style="display:inline">17</span>
<div style="display:none">.</div>
<span class="HRAR">232</span>
<span class="HRAR">247</span>
<span class="HRAR">136</span>
<span class="HRAR">67</span>
<span class="HRAR">49</span>
<div style="display:none">194</div>
<span class="QGZL">.</span>
<span class="QMMO">159</span>
<span class="QMMO">.</span>
<div style="display:none">81</div>
<span style="display:inline">39</span>
<span style="display:none">29</span>
<span style="display:inline">.</span>
<span style="display:none">202</span>
30
<div style="display:none">89</div>
<span class="HRAR">242</span>
<span style="display:none">138</span>
<span class="HRAR">62</span><body>

        """
'''
/  从根节点开始选取
// 从当前节点开始,匹配其后代中任意层级的节点,只要匹配即可选出
*  通配符,匹配任意元素节点
//div/book[1]/title 选择div 下第一个book标签的title元素
//div/book/title[@lang="zh"] 选择 lang 属性值为 "zh" 的 title 元素
//div/book/title、//book/title、//title 具有相同的结果,因为这几种相对路径最终都指向 title
//book/title/@* 将title所有属性值选择出来
//book/title/text() 将title的内容选择出来,使用内置text函数
//a[@href="link1.html" and @id="places_neighbours_row"] 选择同时满足这两个属性条件的 a 元素
//div/book[last()]/title/text() 将最后一个 book 元素的 title 内容选出来
//div/book[price>39]/title/text()
//li[starts-with(@class,'item')]/a/text()
'''

# Parse the obfuscated page and recover the visible IP addresses.
def analysis_content(test_data: str) -> list:
    """Extract the visible IP addresses from an obfuscated HTML page.

    The page hides decoy fragments in two ways: an inline
    ``style="display:none"`` attribute, and CSS classes that the page's
    ``<style>`` block maps to ``display:none``. Both kinds are removed with
    regular expressions, the remaining markup is parsed with lxml, and the
    surviving text fragments are stitched back into dotted-quad strings.

    The regex work follows the usual ``re`` workflow: compile or pass a
    pattern, use ``findall`` to collect every match as a list, and use
    ``sub`` to replace matches (``match``/``search`` find a single match,
    ``finditer`` yields matches lazily, ``split`` cuts a string on matches).

    :param test_data: HTML source of the page, including its ``<style>``
        block.
    :return: list of recovered IP address strings, in page order. The list
        and its length are also printed.
    """
    visible_markup = _strip_hidden_markup(test_data)
    # Drop newlines so bare text on adjacent source lines (e.g. "12" then
    # ".") reaches the parser as one contiguous text node.
    html = lxml.html.fromstring(visible_markup.replace("\n", ""))
    text_nodes = html.xpath('//body/descendant-or-self::text()')
    lt = _group_ip_fragments(text_nodes)
    print(lt)  # recovered IP addresses
    print(len(lt))  # number of addresses
    return lt


def _strip_hidden_markup(markup):
    """Return *markup* with every display:none div/span element removed."""
    # Class names hidden by the stylesheet, e.g. ".HRAR{display:none}".
    hidden_classes = re.findall(
        r'\.([\w-]+)\s*\{\s*display\s*:\s*none\s*;?\s*\}', markup)
    # Elements hidden by an inline style. Non-greedy, and matched through the
    # closing tag, so visible siblings on the same line are left alone.
    markup = re.sub(
        r'<(div|span)\s+style="display:\s*none;?"\s*>.*?</\1>', '', markup)
    if hidden_classes:
        # One alternation covers however many hidden classes the page has.
        names = '|'.join(re.escape(name) for name in hidden_classes)
        markup = re.sub(
            r'<(div|span)\s+class="(?:' + names + r')"\s*>.*?</\1>',
            '', markup)
    return markup


def _group_ip_fragments(text_nodes):
    """Join digit/dot text fragments into complete dotted-quad strings."""
    addresses = []
    current = ""
    for node in text_nodes:
        # Remove all whitespace so stray padding never ends up in an address.
        fragment = "".join(node.split())
        # Skip captions and any other text that is not part of an address.
        if not re.fullmatch(r'[0-9.]+', fragment):
            continue
        # Three dots and a trailing digit mean the address is complete, so
        # this fragment starts the next one. This assumes the last octet
        # arrives in a single fragment.
        if current.count('.') == 3 and not current.endswith('.'):
            addresses.append(current)
            current = ""
        current += fragment
    if current:
        addresses.append(current)
    return addresses


# Run the parser on the sample page defined above; it prints the recovered
# IP list followed by the number of entries.
analysis_content(test_data)
输出结果:['12.82.141.45', '165.239.233.110', '140.7.190.5', '151.39.243.210', '64.96.36.50', '52.156.142.254', '153.4.162.52', '232.10.72.169', '127.3.31.61', '76.17.39.30        ']
10

 

posted @ 2018-12-25 09:49  青春叛逆者  阅读(391)  评论(0)  编辑  收藏  举报