no2. Batch reading of crossdomain.xml (to be improved)

Reading too many URLs in one run is still problematic; a possible mitigation is sketched after the script below.

#coding=utf-8
import urllib
import re
import time


def getxml(url):
    # Fetch the site's crossdomain.xml and grab every domain="..." attribute.
    xml = urllib.urlopen(url + '/crossdomain.xml')
    xmlread = xml.read()
    reg = re.compile(r'(?=domain=)(.*?)(?=/>)')
    domaintxt = re.findall(reg, xmlread)
    return domaintxt

f = open('xmlsource.txt', 'r')
f1 = open('result.txt', 'w')
for line in f.readlines():
    x = line.strip()
    domains = getxml(x)  # fetch each site once instead of re-requesting it
    print 'website:' + x + ' has ' + str(len(domains)) + ' domains:'
    print >>f1, 'website:' + x + ' has ' + str(len(domains)) + ' domains:'
    for entry in domains:
        entry = entry.replace('"', '').replace('domain=', '')
        print entry
        print >>f1, entry
    print '\n'
    print >>f1, '\n'
    time.sleep(1)  # pause between sites to avoid hammering them
print 'Over'
print >>f1, 'Over'
f.close()
f1.close()
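
The script above tends to choke once xmlsource.txt gets long, since a single slow or unreachable host can hang the whole batch. A minimal sketch of one way to soften that, assuming the requests library is installed (getxml_safe is my name, not from the original post): give each fetch a timeout and skip hosts that fail.

# Sketch only, not the original script: add a timeout and skip failures
# so one unreachable site does not stall the whole batch.
import re
import requests

def getxml_safe(url, timeout=5):
    try:
        resp = requests.get(url + '/crossdomain.xml', timeout=timeout)
        return re.findall(r'(?=domain=)(.*?)(?=/>)', resp.text)
    except requests.RequestException:
        return []  # unreachable, slow, or refused: report zero domains

Swapping this in for getxml() would let the loop run through a long list without manual restarts; the 1-second sleep between sites can stay as-is.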

xmlsource.txt (the sites to scan):

http://www.sina.com.cn/
http://www.discuz.net/
http://www.rising.com.cn/
http://www.ifeng.com//
http://www.sdo.com/
http://www.sogou.com/
http://www.163.com/
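
The regex in getxml() scrapes attributes rather than parsing the XML. As a further refinement (my assumption, not something the post implements), the standard-library xml.etree.ElementTree can pull the domain attributes directly:

# Sketch: parse crossdomain.xml properly instead of regex-scraping it.
import xml.etree.ElementTree as ET

def parse_domains(xml_text):
    root = ET.fromstring(xml_text)
    # Allowed origins appear as <allow-access-from domain="..."/> children
    # of the <cross-domain-policy> root element.
    return [e.get('domain') for e in root.findall('allow-access-from')
            if e.get('domain')]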
posted on 2016-05-02 00:31 by crac