import urllib
import urllib2
import re
# Anchor markup expected inside a table cell: captures (href, link text).
# Compiled once at import time instead of on every call.
_DETAIL_LINK_RE = re.compile(r'<a href="(.*?)" target="_blank">(.*?)</a>', re.I)


def getDetailUrl(name):
    """Extract the link target and link text from an HTML anchor.

    Args:
        name: HTML fragment expected to contain an anchor of the form
            ``<a href="..." target="_blank">...</a>``.

    Returns:
        A 2-tuple ``(href, text)`` taken from the first matching anchor.

    Raises:
        ValueError: if ``name`` contains no anchor of the expected form.
    """
    match = _DETAIL_LINK_RE.search(name)
    if match is None:
        # Fail with a message that shows the offending fragment instead of
        # the opaque "'NoneType' object has no attribute 'groups'".
        raise ValueError('no detail link found in: %r' % (name,))
    return match.groups()
def getlist(page, base_url='http://www.jnfdc.gov.cn/kfqy/'):
    """Parse the ``project_table`` HTML table of a listing page into records.

    The header row's ``<th>`` texts become the dictionary keys; every run of
    ``len(headers)`` consecutive ``<td>`` cells becomes one record.  For the
    column headed '企业名称' (company name) the cell is expected to hold an
    anchor: the anchor text is stored under that header and the link target,
    prefixed with ``base_url``, is stored under ``'url'``.

    Args:
        page: HTML source of the listing page.
        base_url: prefix prepended to the relative detail link.

    Returns:
        A list of dicts, one per complete table row.  Empty when the page has
        no ``project_table`` table, no header row, or no data rows.
    """
    table_match = re.search(
        r'<table class="project_table" cellpadding="0" cellspacing="0">'
        r'.*?</table>',
        page, re.I | re.S)
    if table_match is None:
        # No listing table on this page: report "no rows" so the caller's
        # empty-result check can end pagination instead of crashing.
        return []

    # Drop HTML comments first so commented-out cells are not parsed as data.
    # re.S lets the pattern cover comments that span several lines.
    table = re.compile(r'<!--.*?-->', re.S).sub("", table_match.group())

    header_match = re.search(r'<tr.*</th>.*?</tr>', table, re.I | re.S)
    if header_match is None:
        return []
    head = re.findall(r'<th.*?>(.*?)</th>', header_match.group(), re.I | re.S)
    if not head:
        # Without column names no record can be built (and a zero-width row
        # would never advance through the cells).
        return []

    cells = re.findall(r'<td.*?>(.*?)</td>', table, re.I | re.S)

    width = len(head)
    res = []
    # Walk the flat cell list one full row at a time; a trailing partial row
    # is ignored.
    for start in range(0, len(cells) - width + 1, width):
        td = {}
        for column, cell in zip(head, cells[start:start + width]):
            if column == '企业名称':
                # Store the anchor text rather than the raw anchor markup,
                # and keep the absolute detail URL alongside it.
                href, text = getDetailUrl(cell)
                td[column] = text
                td['url'] = base_url + href
            else:
                td[column] = cell
        res.append(td)
    return res
# --- Crawl the paginated listing -------------------------------------------
# Pages are named index.shtml (page 0) and index_<n>.shtml (page n); the
# search form values are appended as a query string.
rooturl = "http://www.jnfdc.gov.cn/kfqy/"
values = {"entname": "", "levelno": "-1"}
data = urllib.urlencode(values)
# NOTE(review): starting at 21 skips index.shtml and index_1 .. index_20;
# confirm this is intended and not a leftover from testing.
pageNum = 21
entlist = []
while True:
    if pageNum == 0:
        param1 = ""
    else:
        param1 = "_" + str(pageNum)
    url = rooturl + "index" + param1 + ".shtml"
    pageNum += 1
    geturl = url + "?" + data
    request = urllib2.Request(geturl)
    response = urllib2.urlopen(request)
    try:
        page = response.read()
    finally:
        # Release the HTTP connection even if reading fails.
        response.close()
    res = getlist(page)
    # A page that yields no records marks the end of the listing.
    if not res:
        break
    # Reuse the already-parsed records instead of parsing the page again.
    entlist += res

# --- Database connection -----------------------------------------------------
# NOTE(review): credentials are hard-coded, and conn/cursor are not used in
# the code visible here; presumably later code uses them - confirm.
import MySQLdb
ip = 'localhost'
username = 'root'
password = '***'
dbname = 'test'
conn = MySQLdb.connect(ip, username, password, dbname, charset='utf8')
cursor = conn.cursor()

# --- Dump the records to a text file ----------------------------------------
# Debug output of the second record; guarded so a short result list does not
# abort the script with an IndexError.
if len(entlist) > 1:
    print(entlist[1])
print('file...')
# The with-block guarantees the file is flushed and closed.
with open("d:\\entinfo.txt", 'w') as f:
    for e in entlist:
        # One record per line, formatted as "key:value, key:value".
        le = ", ".join(key + ":" + value for key, value in e.items()) + '\n'
        f.write(le)
print('done')