Machine Learning: Searching and Ranking

A simple search engine

The core idea (a minimal in-memory sketch follows this list):

  1. Crawl the given pages, extract the URLs they contain, and recursively crawl those URLs up to a configurable depth
  2. Extract the text from each page, split it into words according to a simple rule, and store each word in a database together with the URLs it appears in
  3. Split the query into words, look up the URLs stored for each word, and return them
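
A minimal in-memory sketch of the same idea (the add_page/search helpers are hypothetical and only illustrate the inverted index; the real, sqlite-backed implementation follows below):

index = {}

def add_page(url, text):
    # naive tokenization: lowercase, split on whitespace
    for word in text.lower().split():
        index.setdefault(word, set()).add(url)

def search(query):
    # intersect the url sets of all query words
    results = None
    for word in query.lower().split():
        urls = index.get(word, set())
        results = urls if results is None else results & urls
    return results or set()

add_page('http://example.com/a', 'python set tutorial')
add_page('http://example.com/b', 'python dict tutorial')
print search('python tutorial')   # both pages
print search('set')               # only the first page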

Ranking the search results:

  1. Content-based ranking
    1. Word frequency
    2. Document location
    3. Word distance
  2. Ranking with inbound links
    1. PageRank (a toy sketch follows this list)
    2. Link text
  3. Ranking with an artificial neural network
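
PageRank uses the damped update pr(A) = 0.15 + 0.85 * sum(pr(B) / outlinks(B)) over all pages B linking to A. A minimal sketch on a made-up three-page link graph (simplified: it recomputes all scores from the previous iteration, whereas calculatepagerank below updates them in place in sqlite):

# Toy link graph: each page maps to the pages it links to (made-up data)
links = {'A': ['B', 'C'], 'B': ['C'], 'C': ['A']}

pr = dict((page, 1.0) for page in links)
for i in range(20):
    newpr = {}
    for page in links:
        # pages that link to this page
        inbound = [p for p in links if page in links[p]]
        newpr[page] = 0.15 + 0.85 * sum(pr[p] / len(links[p]) for p in inbound)
    pr = newpr
print pr
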
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
import sqlite3
import re

# 需要被忽略的单词
ignorewords = set(['the', 'of', 'a', 'and', 'to', 'in', 'is', 'it'])

class crawler:
    def __init__(self, dbname):
        self.con = sqlite3.connect(dbname)
    def __del__(self):
        self.con.close()
    def dbcommit(self):
        self.con.commit()
    def createindextables(self): 
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()
    
    # Split text into lowercase words; here we simply split on non-alphanumeric characters
    def separatewords(self, text):
        splitter = re.compile('\\W+')
        return [s.lower() for s in splitter.split(text) if s != '']
        
    def addtoindex(self, url, soup):
        if self.isindexed(url):
            return
        
        # Extract the words on the page
        text = self.gettextonly(soup)
        words = self.separatewords(text)
        
        # Get the id for this url
        urlid = self.getentryid('urllist', 'url', url)
        
        # Link each word to this url with its position
        for i in range(len(words)):
            word = words[i]
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute("insert into wordlocation(urlid, wordid, location) values(%d, %d, %d)" 
                             % (urlid, wordid, i))
    
    def getentryid(self, table, field, value, createnew=True):
        cur = self.con.execute("select rowid from %s where %s='%s'" % (table, field, value))
        res = cur.fetchone()
        if res == None:
            cur = self.con.execute("insert into %s(%s) values('%s')" % (table, field, value))
            return cur.lastrowid
        else:
            return res[0]
        
    def isindexed(self, url):
        u = self.con.execute("select rowid from urllist where url='%s'" % url).fetchone()
        if u != None:
            v = self.con.execute("select * from wordlocation where urlid=%d" % u[0]).fetchone()
            if v != None:
                return True
        else:
            return False

    # Recursively extract the text from a tag
    def gettextonly(self, soup):
        v = soup.string
        if v == None:
            c = soup.contents
            resulttext = ''
            for t in c:
                subtext = self.gettextonly(t)
                resulttext += subtext + '\n'
            return resulttext
        else:
            return v.strip()
   
    def addlinkref(self, urlFrom, urlTo, linkText):
        words = self.separatewords(linkText)
        fromid = self.getentryid('urllist', 'url', urlFrom)
        toid = self.getentryid('urllist', 'url', urlTo)
        if fromid == toid:
            return
        cur = self.con.execute("insert into link(fromid,toid) values (%d,%d)" % (fromid, toid))
        linkid = cur.lastrowid
        for word in words:
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute("insert into linkwords(linkid,wordid) values (%d,%d)" % (linkid, wordid))
    def crawl(self, pages, depth=2):
        # Breadth-first crawl to the given depth
        for i in range(depth):
            newpages = set()
            # Open each page in turn
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "could not open %s" % page
                    continue
                soup = BeautifulSoup(c)
                # Index this page in the database
                self.addtoindex(page, soup)
                
                # Find all the links on this page
                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http' and not self.isindexed(url):
                            newpages.add(url)
                        linktext = self.gettextonly(link)
                        self.addlinkref(page, url, linktext)
                self.dbcommit()
            pages = newpages
pages = ["http://www.iplaypy.com/jichu/set.html"]
crawler = crawler('test.db')
crawler.crawl(pages)
# Run createindextables() once on a new database before the first crawl:
#crawler.createindextables()
import sqlite3

import nn  # the searchnet neural network module defined further down in this post


network = nn.searchnet('nn.db')

# Searching
class searcher:
    def __init__(self, dbname):
        self.con = sqlite3.connect(dbname)
        
    def __del__(self):
        self.con.close()
    
    def getmatchrows(self, query):
        # Build the query string
        fieldlist = 'w0.urlid'
        tablelist = ''
        clauselist = ''
        wordids = []
        
        # Split the query on spaces
        words = query.split(' ')
        print words
        tablenumber = 0
        
        for word in words:
            # Get the id of this word
            wordrow = self.con.execute("select rowid from wordlist where word='%s'" % word).fetchone()
            if wordrow != None:
                wordids.append(wordrow[0])
                print tablenumber
                if tablenumber > 0:
                    tablelist += ','
                    clauselist += ' and '
                    clauselist += 'w%d.urlid=w%d.urlid and ' % (tablenumber-1, tablenumber)
                fieldlist += ', w%d.location' % tablenumber
                tablelist += 'wordlocation w%d' % tablenumber
                clauselist += 'w%d.wordid=%d' % (tablenumber, wordrow[0])
                tablenumber += 1
        
        # Assemble the full query
        fullquery = 'select %s from %s where %s' % (fieldlist, tablelist, clauselist)
        print fullquery
        cur = self.con.execute(fullquery)
        rows = [row for row in cur]
        
        return rows, wordids
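    # For example, for a two-word query whose word ids happen to be 10 and 17 (hypothetical
    # ids), the generated SQL would be:
    #   select w0.urlid, w0.location, w1.location from wordlocation w0,wordlocation w1
    #   where w0.wordid=10 and w0.urlid=w1.urlid and w1.wordid=17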
    
    # Compute a weighted score for each url
    def getscoredlist(self, rows, wordids):
        totalscores = dict([(row[0], 0) for row in rows])
        
        # Scoring functions and their weights
        weights = [(0.2, self.locationscore(rows)), (0.3, self.frequencyscore(rows)), 
                  (0.1, self.distancescore(rows)), (0.2, self.pagerankscore(rows)),
                  (0.2, self.linktextscore(rows, wordids))]
        
        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]
        return totalscores
    
    def geturlname(self, id):
        return self.con.execute('select url from urllist where rowid=%d' % id).fetchone()[0]
    
    # Run a query and print the scored results
    def query(self, query):
        rows, wordids = self.getmatchrows(query)
        scores = self.getscoredlist(rows, wordids)
        rankedscores = sorted([(score, url) for (url, score) in scores.items()], reverse=1)
        for (score, urlid) in rankedscores[0:10]:
            print '%f\t%s' % (score, self.geturlname(urlid))
    
    # Normalization: rescale each scoring metric so all scores fall into the 0-1 range
    def nomalizescores(self, scores, smallisbetter=0):
        # If the divisor would be 0, substitute a small value to avoid division by zero
        vsmall = 0.00001
        if smallisbetter:
            minscore = min(scores.values())
            return dict([(u, float(minscore) / max(vsmall, l)) for (u, l) in scores.items()])
        else:
            maxscore = max(scores.values())
            if maxscore == 0:
                maxscore = vsmall
            return dict([(u, float(c) / maxscore) for (u, c) in scores.items()])
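    # For example, nomalizescores({1: 5, 2: 10}) returns {1: 0.5, 2: 1.0},
    # while nomalizescores({1: 5, 2: 10}, smallisbetter=1) returns {1: 1.0, 2: 0.5}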
        
    # Word frequency: count how many matching rows each url has
    def frequencyscore(self, rows):
        counts = dict([(row[0], 0) for row in rows])
        for row in rows:
            counts[row[0]] += 1
        return self.nomalizescores(counts)
    
    # Document location: the earlier in the page the query words appear, the better
    def locationscore(self, rows):
        # Use 1000000 as an upper bound on the word location
        locations = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.nomalizescores(locations, smallisbetter=1)
    
    # Word distance: the closer the query words are to each other, the higher the score
    def distancescore(self, rows):
        # With only one query word every url scores the same, so return right away
        if len(rows[0]) <= 2:
            return dict([(row[0], 1.0) for row in rows])
        
        mindistance = dict([(row[0], 1000000) for row in rows])
        
        for row in rows:
            dist = sum([abs(row[i] - row[i-1]) for i in range(2, len(row))])
            if dist < mindistance[row[0]]:
                mindistance[row[0]] = dist
        
        return self.nomalizescores(mindistance)
    
    # Inbound-link count: simply count how many times each url is linked to
    def inboundlinkscore(self, rows):
        uniqueurls = set(row[0] for row in rows)
        inboundcount = dict([(u, self.con.execute('select count(*) from link where toid=%d'
                                                  % u).fetchone()[0]) for u in uniqueurls])
        return self.nomalizescores(inboundcount)
    
    # PageRank: compute the PageRank value for every url
    def calculatepagerank(self, iterations=20):
        self.con.execute('drop table if exists pagerank')
        self.con.execute('create table pagerank(urlid primary key, score)')
        
        # Initialize every url's rank to 1.0
        self.con.execute('insert into pagerank select rowid, 1.0 from urllist')
        self.con.commit()
        
        # Iteratively update the ranks
        for i in range(iterations):
            print 'Iteration %d' % i
            for (urlid,) in self.con.execute('select rowid from urllist'):
                # Minimum rank (damping term)
                pr = 0.15
                
                # Loop over every page linking to this one
                for (linker,) in self.con.execute('select distinct fromid from link where toid=%d' % urlid):
                    # Get the PageRank of the linking page
                    linkingpr = self.con.execute('select score from pagerank where urlid=%d' % linker).fetchone()[0]
                    
                    # Get the total number of links on the linking page
                    linkingcount = self.con.execute('select count(*) from link where fromid=%d' % linker).fetchone()[0]
                    
                    pr += 0.85 * (linkingpr / linkingcount)
                    
                # Store the updated PageRank for this url
                self.con.execute('update pagerank set score=%f where urlid=%d' % (pr, urlid))
                self.con.commit()
    
    # Inbound links: score by PageRank
    def pagerankscore(self, rows):
        pageranks = dict([(row[0], self.con.execute('select score from pagerank where urlid=%d' 
                                                    % row[0]).fetchone()[0]) for row in rows])  
        return self.nomalizescores(pageranks)
    
    # Link text: if a query word appears in the text of a link pointing at a page, add the linking page's PageRank to that page's score
    def linktextscore(self, rows, wordids):
        linkscores = dict([(row[0], 0) for row in rows])
        for wordid in wordids:
            cur = self.con.execute('select link.fromid, link.toid from linkwords,link where wordid=%d and linkwords.linkid=link.rowid' % wordid)
            for (fromid, toid) in cur:
                if toid in linkscores:
                    pr = self.con.execute('select score from pagerank where urlid=%d' % fromid).fetchone()[0]
                    linkscores[toid] += pr
        return self.nomalizescores(linkscores)
    
    def nnscore(self, rows, wordids):
        # Build a list of the unique url ids
        urlids = [urlid for urlid in set([row[0] for row in rows])]
        nnres = network.getresult(wordids, urlids)
        scores = dict([(urlids[i], nnres[i]) for i in range(len(urlids))])
        return self.nomalizescores(scores)
        
searcher = searcher('test.db')
#searcher.getmatchrows('set Python')
# calculatepagerank() must have been run at least once, since pagerankscore reads the pagerank table
#searcher.calculatepagerank()
searcher.query('set')
['set']
0
select w0.urlid, w0.location from wordlocation w0 where w0.wordid=4
0.798721	http://www.iplaypy.com/jichu/set.html
0.356789	http://www.iplaypy.com/jichu/
0.342161	http://www.iplaypy.com/jichu/var.html
0.341273	http://www.iplaypy.com/jichu/dict.html
0.339879	http://www.iplaypython.com/jichu/dict.html
0.328156	http://www.iplaypy.com/jichu/dir.html
0.328135	http://www.iplaypy.com/jichu/note.html
0.328107	http://www.iplaypy.com/jichu/function.html
0.328074	http://www.iplaypy.com/jichu/int.html
0.328048	http://www.iplaypy.com/jichu/class.html

Neural network

Input -> neural network layers (possibly more than one layer, each with multiple nodes) -> output

The network needs a large number of input/output pairs for training. If a given combination of inputs has no corresponding node in the hidden layer yet, one is added; if it already exists, the input-to-node and node-to-output weights are updated. These weights form matrices, and with enough training data every weight moves closer to its true value.
Training step: compute the outputs from the current weights and the given inputs, compare them with the expected outputs for those inputs, and correct the weight matrices accordingly, as sketched below.
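
A minimal numeric sketch of one such training step, with a single input, a single hidden node, and a single output (made-up weights and learning rate; it uses the same tanh activations and delta-rule updates as the searchnet class below):

from math import tanh

wi, wo = 0.5, 0.1   # made-up input->hidden and hidden->output weights
ai = 1.0            # input activation
target = 1.0        # desired output (e.g. the url the user actually clicked)
N = 0.5             # learning rate

for step in range(5):
    # feedforward
    ah = tanh(ai * wi)
    ao = tanh(ah * wo)
    # backpropagate the error; dtanh(y) = 1 - y*y
    output_delta = (1.0 - ao * ao) * (target - ao)
    hidden_delta = (1.0 - ah * ah) * output_delta * wo
    wo += N * output_delta * ah
    wi += N * hidden_delta * ai
    print 'step %d: output %f' % (step, ao)

The output climbs toward the target on every step, which is what trainquery does for the url a user selected.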

from math import tanh
import sqlite3

# Derivative of tanh in terms of its output: if y = tanh(x), then tanh'(x) = 1 - y*y
def dtanh(y):
    return 1.0 - y * y
    
class searchnet:
    def __init__(self, dbname):
        self.con = sqlite3.connect(dbname)
    
    def __del__(self):
        self.con.close()
    
    # Create the tables
    def maketables(self):
        self.con.execute('create table hiddennode(create_key)')
        self.con.execute('create table wordhidden(fromid, toid, strength)')
        self.con.execute('create table hiddenurl(fromid, toid, strength)')
        self.con.commit()
    
    # Look up the strength of a connection
    def getstrength(self, fromid, toid, layer):
        if layer == 0:
            tablename = 'wordhidden'
        else:
            tablename = 'hiddenurl'
        res = self.con.execute('select strength from %s where fromid=%d and toid=%d' % (tablename, fromid, toid)).fetchone()
        if res == None:
            if layer == 0:
                return -0.2
            if layer == 1:
                return 0
        
        return res[0]
    
    # Update the strength of a connection, or create the connection if it does not exist
    def setstrength(self, fromid, toid, layer, strength):
        if layer == 0:
            tablename = 'wordhidden'
        else:
            tablename = 'hiddenurl'
        res = self.con.execute('select rowid from %s where fromid=%d and toid=%d' % (tablename, fromid, toid)).fetchone()
        if res == None:
            self.con.execute('insert into %s (fromid, toid, strength) values(%d, %d, %s)' % (tablename, fromid, toid, strength))
        else:
            self.con.execute('update %s set strength=%f where rowid=%d' % (tablename, strength, res[0]))
            
    # Create a hidden node for a combination of query words
    def generatehiddennode(self, wordids, urls):
        if len(wordids) > 3:
            return None
        create_key = '_'.join(sorted([str(wordid) for wordid in wordids]))
        res = self.con.execute("select rowid from hiddennode where create_key='%s'" % create_key).fetchone()
        
        # If no such node exists yet, create it
        if res == None:
            cur = self.con.execute("insert into hiddennode (create_key) values('%s')" % create_key)
            hiddenid = cur.lastrowid
            # Set default weights on the input side
            for wordid in wordids:
                self.setstrength(wordid, hiddenid, 0, 1.0/len(wordids))
                
            # Set default weights on the output side
            for url in urls:
                self.setstrength(hiddenid, url, 1, 0.1)
            self.con.commit()
    # Get the ids of all hidden nodes relevant to this query
    def getallhiddenids(self, wordids, urlids):
        ll = {}
        for wordid in wordids:
            cur = self.con.execute('select toid from wordhidden where fromid=%d' % wordid)
            for row in cur:
                ll[row[0]] = 1
        for urlid in urlids:
            cur = self.con.execute('select fromid from hiddenurl where toid=%d' % urlid)
            for row in cur:
                ll[row[0]] = 1
                
        return ll.keys()
    
    # Build the in-memory network for the given words and urls
    def setupnetwork(self, wordids, urlids):
        # Node id lists
        self.wordids = wordids
        self.hiddenids = self.getallhiddenids(wordids, urlids)
        self.urlids = urlids
        
        # Node activations
        self.ai = [1.0] * len(self.wordids)
        self.ah = [1.0] * len(self.hiddenids)
        self.ao = [1.0] * len(self.urlids)
        print self.ao
        
        # Build the weight matrices
        self.wi = [[self.getstrength(wordid, hiddenid, 0) for hiddenid in self.hiddenids]
                  for wordid in self.wordids]
        self.wo = [[self.getstrength(hiddenid, urlid, 1) for urlid in self.urlids]
                  for hiddenid in self.hiddenids]
    # Feedforward algorithm
    def feedforward(self):
        # The query words are the inputs
        for i in range(len(self.wordids)):
            self.ai[i] = 1.0
            
        # Activations of the hidden-layer nodes
        for j in range(len(self.hiddenids)):
            summ = 0.0
            for i in range(len(self.wordids)):
                summ += self.ai[i] * self.wi[i][j]
            self.ah[j] = tanh(summ)
        
        # Activations of the output-layer nodes
        for k in range(len(self.urlids)):
            summ = 0.0
            for i in range(len(self.hiddenids)):
                summ += self.ah[i] * self.wo[i][k]
            
            self.ao[k] = tanh(summ)
            
        return self.ao[:]
    
    def getresult(self, wordids, urlids):
        self.setupnetwork(wordids, urlids)
        return self.feedforward()
    
    # Backpropagation algorithm
    def backpropagate(self, targets, N=0.5):
        # Compute output-layer errors
        output_deltas = [0.0] * len(self.urlids)
        for k in range(len(self.urlids)):
            error = targets[k] - self.ao[k]
            output_deltas[k] = dtanh(self.ao[k]) * error
        
        # Compute hidden-layer errors
        hidden_deltas = [0.0] * len(self.hiddenids)
        for j in range(len(self.hiddenids)):
            error = 0.0
            for k in range(len(self.urlids)):
                error = error + output_deltas[k] * self.wo[j][k]
            hidden_deltas[j] = dtanh(self.ah[j]) * error
            
        # Update the hidden-to-output weights
        for j in range(len(self.hiddenids)):
            for k in range(len(self.urlids)):
                change = output_deltas[k] * self.ah[j]
                self.wo[j][k] += N * change
        
        # Update the input-to-hidden weights
        for i in range(len(self.wordids)):
            for k in range(len(self.hiddenids)):
                change = hidden_deltas[k] * self.ai[i]
                self.wi[i][k] += N * change
            
    
    # Train the network on one query
    def trainquery(self, wordids, urlids, selectedurl):
        # Make sure a hidden node exists for this combination of words
        self.generatehiddennode(wordids, urlids)
        self.setupnetwork(wordids, urlids)
        self.feedforward()
        targets = [0.0] * len(urlids)
        targets[urlids.index(selectedurl)] = 1.0
        self.backpropagate(targets)
        self.updatedatabase()
        
    # Write the current weights back to the database
    def updatedatabase(self):
        for i in range(len(self.wordids)):
            for j in range(len(self.hiddenids)):
                self.setstrength(self.wordids[i], self.hiddenids[j], 0, self.wi[i][j])
        
        for j in range(len(self.hiddenids)):
            for k in range(len(self.urlids)):
                self.setstrength(self.hiddenids[j], self.urlids[k], 1, self.wo[j][k])
        self.con.commit()
net = searchnet('nn.db')
#net.maketables()
wWorld, wRiver, wBank = 101, 102, 103
uWorldBank, uRiver, uEarth = 201, 202, 203
#net.generatehiddennode([wWorld, wBank], [uWorldBank, uRiver, uEarth])
#for c in net.con.execute('select * from wordhidden'): 
#    print c

#for c in net.con.execute('select * from hiddenurl'): 
#    print c
net.trainquery([wWorld, wBank], [uWorldBank, uRiver, uEarth], uWorldBank)
net.getresult([wWorld, wBank], [uWorldBank, uRiver, uEarth])

[1.0, 1.0, 1.0]
[1.0, 1.0, 1.0]





[0.7775224145252707, -0.0110282659654087, -0.0110282659654087]
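
Note how after a single training pass for uWorldBank the network already scores the selected url (0.78) far above the other two (-0.01); this is the signal that nnscore feeds back into the overall ranking.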