// 文字识别-一阶隐马尔可夫模型 (Text recognition: first-order hidden Markov model)
/**
 * Text recognition with a hidden Markov model.
 *
 * There are 3+n states: S, B, Mn (M1, M2, ...) and E. When a sample is
 * learned, its first character is tagged B, its last character E and the
 * characters in between M1, M2, ...; S is the state every character carries
 * by default.
 *
 * - AMap   counts of state transitions ("from-to" keys).
 * - BMap   per character, counts of the states it was seen in.
 * - CMap   per "from-to-prevChar-char" key, the labels learned for it.
 * - AMapGl / BMapGl  the same tables turned into probabilities by makeGl().
 */

// Finite set of state kinds ("Mn" stands for the numbered states M1, M2, ...).
const S = ['S', 'B', 'Mn', 'E'];

const mekflink = {
  // Prototype of every emission table; also the whole table used for
  // characters that were never learned.
  empty: { S: 0.05 },
  // Transition counts. The S/E -> S/B entries are seeded here because add()
  // only ever records transitions inside a sample (B -> M1 -> ... -> E).
  AMap: {
    'S-S': 1000,
    'S-B': 1000,
    'E-B': 1000,
    'E-S': 1000,
  },
  BMap: {},
  CMap: {}, // used to look up the recognition result
  AMapGl: {},
  BMapGl: {},

  /**
   * Learn one sample.
   *
   * @param {string} text - Feature string; samples of length <= 1 are ignored.
   * @param {*} val - Label to associate with the sample.
   */
  add(text, val) {
    if (text.length <= 1) {
      return;
    }
    const last = text.length - 1;
    this.push(text[0], 'B');
    for (let i = 1; i <= last; i++) {
      const t0 = i === 1 ? 'B' : `M${i - 1}`;
      const t1 = i === last ? 'E' : `M${i}`;
      this.push(text[i], t1);
      this.pushState(t0, t1, text[i - 1], text[i], val);
    }
  },

  /**
   * Record one observed transition and remember which label produced it.
   *
   * @param {string} t0N - State of the previous character.
   * @param {string} t1N - State of the current character.
   * @param {string} t0 - Previous character.
   * @param {string} t1 - Current character.
   * @param {*} val - Label of the sample being learned.
   */
  pushState(t0N, t1N, t0, t1, val) {
    const { AMap, CMap } = this;
    const key = `${t0N}-${t1N}`;
    AMap[key] = (AMap[key] || 0) + 1;

    const key2 = `${key}-${t0}-${t1}`;
    if (!CMap[key2]) {
      CMap[key2] = [];
    }
    // Each label is stored at most once per key.
    if (CMap[key2].indexOf(val) === -1) {
      CMap[key2].push(val);
    }
  },

  /**
   * Count one occurrence of a character in a state.
   *
   * @param {string} key - The character.
   * @param {string} state - The state it was observed in.
   */
  push(key, state) {
    const { BMap } = this;
    if (!BMap[key]) {
      // Every known character starts with one count for S.
      BMap[key] = { S: 1 };
    }
    const counts = BMap[key];
    counts[state] = (counts[state] || 0) + 1;
  },

  /**
   * Build the model: turn the raw counts into probabilities.
   *
   * @returns {{AMapGl: Object, BMapGl: Object}} The (mutated) probability
   *   tables that are also stored on this object.
   */
  makeGl() {
    const { AMap, BMap, AMapGl, BMapGl } = this;

    // Transitions: normalise each count by the total for its source state.
    const AMapT = {};
    for (const key of Object.keys(AMap)) {
      const [t0] = key.split('-');
      AMapT[t0] = (AMapT[t0] || 0) + AMap[key];
    }
    for (const key of Object.keys(AMap)) {
      const [t0] = key.split('-');
      AMapGl[key] = AMap[key] / AMapT[t0];
    }

    // Emissions: normalise each character's state counts by their sum.
    for (const key of Object.keys(BMap)) {
      const counts = BMap[key];
      let total = 0;
      for (const k of Object.keys(counts)) {
        total = total + counts[k];
      }
      const obj = Object.create(this.empty);
      for (const k of Object.keys(counts)) {
        obj[k] = counts[k] / total;
      }
      BMapGl[key] = obj;
    }
    return { AMapGl, BMapGl };
  },

  /**
   * Advance the candidate paths by one character.
   *
   * @param {Array<{t0: string, gl: number, data: string}>} t0Obj - Current
   *   candidates: end state, score and the '-'-joined state path so far.
   * @param {Object} BObj - State probabilities of the next character.
   * @returns {Array<{t0: string, gl: number, data: string}>} The surviving
   *   candidates, with scores rescaled so that they sum to 100.
   */
  getT1Arr(t0Obj, BObj) {
    const { AMapGl } = this;
    const t1Obj = [];
    let allgl = 0;
    // for...in on purpose: BObj may only inherit its S entry from `empty`.
    for (const t1 in BObj) {
      // Best predecessor for ending in state t1.
      let maxLink;
      t0Obj.forEach((link) => {
        const transition = AMapGl[`${link.t0}-${t1}`];
        if (!transition) {
          return;
        }
        const gl = link.gl * transition * BObj[t1];
        if (gl > 0 && (!maxLink || gl > maxLink.gl)) {
          maxLink = { t0: t1, gl, data: link.data };
        }
      });
      // Keep the candidate only if its score, times the number of candidates
      // there would then be, exceeds the sum of the scores kept so far.
      if (maxLink && maxLink.gl * (t1Obj.length + 1) > allgl) {
        allgl = allgl + maxLink.gl;
        maxLink.data = `${maxLink.data}-${maxLink.t0}`;
        t1Obj.push(maxLink);
      }
    }
    t1Obj.forEach((link) => {
      link.gl = (link.gl * 100) / allgl;
    });
    return t1Obj;
  },

  /**
   * Recognise the labels contained in a feature string.
   *
   * @param {string} text - Feature string to decode.
   * @returns {Array} Labels found, in order of appearance; an empty array
   *   when nothing is recognised or no state path survives (for example
   *   when makeGl() has not been called yet).
   */
  fastSolve(text) {
    const { BMapGl, CMap } = this;
    const len = text.length;

    // Markov chain: starting from S or B with equal score, repeatedly derive
    // the possible next states from the current ones.
    let t0Obj = [
      { t0: 'S', gl: 1, data: 'S' },
      { t0: 'B', gl: 1, data: 'B' },
    ];
    for (let i = 1; i < len; i++) {
      t0Obj = this.getT1Arr(t0Obj, BMapGl[text[i]] || Object.create(this.empty));
    }
    if (t0Obj.length === 0) {
      // No path survived: still honour the array return type.
      return [];
    }
    t0Obj.sort((p1, p2) => p2.gl - p1.gl);
    const arr = t0Obj[0].data.split('-');

    // Cut the best path into [start, end) segments: from a B up to the next
    // E, or up to the end of the text when it stops in a middle state.
    const data = [];
    let start;
    for (let i = 0; i < len; i++) {
      if (arr[i] === 'B') {
        start = i;
      } else if (arr[i] === 'E' || (i === len - 1 && arr[i] !== 'S')) {
        data.push([start, i + 1]);
      }
    }

    // For each segment, let every transition vote for the labels it was
    // learned from and keep the label with the most votes.
    const result = [];
    data.forEach(([from, to]) => {
      // Null prototype so labels such as 'constructor' are counted correctly.
      const kmap = Object.create(null);
      let kmax;
      let premax;
      let alen = to - from;
      for (let i = from + 1; i < to; i++) {
        const key = `${arr[i - 1]}-${arr[i]}-${text[i - 1]}-${text[i]}`;
        const labels = CMap[key];
        if (!labels) {
          continue;
        }
        labels.forEach((k) => {
          kmap[k] = (kmap[k] || 0) + 1;
          kmax = kmax || k;
          if (kmax !== k && kmap[k] >= kmap[kmax]) {
            premax = kmax;
            kmax = k;
          }
        });
        // Stop early once the previous leader, even with the remaining
        // budget `alen` added, is behind the current leader.
        if (premax && kmap[premax] + (alen--) < kmap[kmax]) {
          break;
        }
      }
      if (kmax) {
        result.push(kmax);
      }
    });
    return result;
  },
};

if (typeof module === 'object') {
  module.exports = mekflink;
}

console.time('time');
// demo
const arrH = [
  {
    h: 'ddm6bieggfge6222',
    t: '文',
  },
  {
    h: 'ddm6biegdfge6222',
    t: '字',
  },
  {
    h: 'ddm6biegefge6222',
    t: '识',
  },
  {
    h: 'ddm6biegsfge6222',
    t: '别',
  },
  {
    h: 'ddm6biefgfge6222',
    t: '!',
  },
];
arrH.forEach((item) => {
  mekflink.add(item.h, item.t);
});
mekflink.makeGl();
const text = 'ddm6bieggfge622212121ddm6biegdfge6222sfsaffsafddm6biegefge6222adfadddm6biegsfge6222ddm6biefgfge6222ddm6biefgf6';
const arr = mekflink.fastSolve(text);
console.timeEnd('time');
console.log(arr);