
import urllib2
import urllib
import chardet
import thread
import threading
from bs4 import BeautifulSoup as BS

# Module-level re-entrant lock.
# NOTE(review): nothing in the visible code acquires it; presumably it was
# meant to serialize output from worker threads - confirm or remove.
myLock = threading.RLock()
class GetUrls(object):
	"""Collect result links from Baidu search pages and record their titles.

	The search keyword is substituted into the Baidu query URL. UrlParse()
	then walks the result pages by following the numbered pager links and,
	for result pages 11 to 20, resolves every result link and appends the
	page title and the final URL to 'info.txt' in the working directory.

	NOTE: pageCount and search_url are class attributes and therefore shared
	by all instances; the class is not written for concurrent crawls.
	"""

	# 1-based number of the result page currently being processed.
	pageCount = 1
	# Result pages up to and including this number are walked but not harvested.
	_skipPages = 10
	# Last result page that is fetched.
	_lastPage = 20
	# Pristine query template; 'key' is the placeholder for the keyword.
	_search_template = 'http://www.baidu.com/s?wd=key'
	# URL of the most recently configured search. Kept as a class attribute
	# because __init__ has always published it there.
	search_url = _search_template
	req_header =  {'User-Agent':'Mozilla/5.0(Windows;U;Windows NT 6.1;en-US; rv: Firefox/3.5.6'}

	def __init__(self,inputInfo):
		"""Build the search URL for the keyword *inputInfo*.

		inputInfo -- search keyword, a byte string or a unicode string
		             (unicode is encoded as UTF-8 before being quoted).
		"""
		self.inputInfo = inputInfo
		keyword = inputInfo
		if not isinstance(keyword, str):
			# urllib.quote_plus cannot handle non-ASCII unicode objects.
			keyword = keyword.encode('utf-8')
		# Always start from the pristine template: replacing inside the
		# previously built URL would leave later instances with the first
		# instance's keyword. Percent-encode so spaces, '&' and non-ASCII
		# bytes cannot corrupt the query string.
		self.search_url = GetUrls._search_template.replace('key', urllib.quote_plus(keyword))
		GetUrls.search_url = self.search_url

	#detect the coding of the html page
	def __detectCode(self,url):
		"""Return the lower-cased character encoding name of the page at *url*.

		The charset from the Content-Type header is preferred; if it is
		absent the body is sniffed with chardet, and 'utf-8' is the last
		resort. The page is downloaded once and the response is always closed.
		"""
		res = urllib.urlopen(url)
		try:
			coding = res.info().getparam('charset')
			if coding is None:
				coding = chardet.detect(res.read())['encoding']
		finally:
			res.close()
		if coding is None:
			coding = 'utf-8'
		return coding.lower()

	#get the html title
	def __getTitle(self,url):
		"""Return the <title> text of the page at *url*, or None.

		None is returned when the page cannot be fetched, has no <title>
		element, or the title has no single text child.
		"""
		try:
			coding = self.__detectCode(url)
			titleReq = urllib2.Request(url,None,GetUrls.req_header)
			titleRes = urllib2.urlopen(titleReq)
			try:
				html = titleRes.read()
			finally:
				titleRes.close()
		except (urllib2.URLError, IOError):
			# HTTPError is a URLError; urllib.urlopen (used for encoding
			# detection) reports failures as plain IOError.
			return None
		try:
			text = html.decode(coding,'ignore')
		except LookupError:
			# The server advertised a codec name Python does not know.
			text = html.decode('utf-8','ignore')
		titleTag = BS(text).title
		if titleTag is None:
			return None
		return titleTag.string

	#get the information inside the html page
	def __getInfo(self,redUrl):
		"""Append the title (GBK-encoded) and URL of *redUrl* to 'info.txt'.

		Nothing is written when no title could be obtained. The title is
		fetched before the file is opened so the file handle is not held
		during network access.
		"""
		title = self.__getTitle(redUrl)
		if not title:
			return
		with open('info.txt',mode='a') as a_file:
			a_file.write(title.encode('gbk','ignore') + '\n')
			a_file.write(redUrl + '\n\n')

	def __resolveRedirect(self,url):
		"""Return the URL that *url* finally redirects to, or *url* itself on failure."""
		try:
			req = urllib2.Request(url,None,GetUrls.req_header)
			res = urllib2.urlopen(req)
			try:
				return res.geturl()
			finally:
				res.close()
		except (urllib2.URLError, ValueError):
			# URLError covers HTTPError; ValueError is raised for hrefs that
			# are not absolute URLs.
			return url

	def __searchUrls(self,url):
		"""Walk the result pages starting at *url* and record the harvested links.

		Pages are fetched until the page counter passes _lastPage or no link
		to the next page number is found. Result links are only processed on
		pages after _skipPages.

		NOTE(review): the reviewed source computed the redirected URL and the
		next page URL but never used them; passing them to __getInfo and
		following them here is a reconstruction - confirm the intent.
		"""
		while url and GetUrls.pageCount <= GetUrls._lastPage:
			req = urllib2.Request(url,None,GetUrls.req_header)
			res = urllib2.urlopen(req)
			try:
				html = res.read()
			finally:
				res.close()
			soup = BS(html.decode('utf-8','ignore'))

			#get the 100 urls from page 11 to page 20
			if GetUrls.pageCount > GetUrls._skipPages:
				for hh in soup.find_all('h3'):
					#get url in the html page
					if hh.a is None:
						continue
					urlInPage = hh.a.get('href')
					if not urlInPage:
						continue
					#get url after redirecting
					self.__getInfo(self.__resolveRedirect(urlInPage))

			GetUrls.pageCount += 1
			# The pager shows each page number inside a <span> whose parent
			# carries the link to that page.
			pNode = soup.find_all('span',text=GetUrls.pageCount)
			if not pNode:
				break
			nextHref = pNode[0].parent.get('href')
			if not nextHref:
				break
			url = 'http://www.baidu.com' + nextHref

	def UrlParse(self):
		"""Run the crawl for this instance's keyword.

		NOTE(review): the body of this method was missing from the reviewed
		source; starting the page walk from the search URL is a
		reconstruction based on the otherwise unused private helpers -
		confirm the intent.
		"""
		# Restart the shared page counter so repeated calls crawl again.
		GetUrls.pageCount = 1
		self.__searchUrls(self.search_url)

if __name__ == '__main__':
	# Build the crawler for the sample keyword and start it; constructing the
	# object alone performs no requests.
	# NOTE(review): under Python 2 this non-ASCII literal requires a PEP 263
	# encoding declaration at the top of the file - confirm one is present.
	getUrlInfo = GetUrls('挖掘机')
	getUrlInfo.UrlParse()


