Python 遍历 Hadoop 数据，与指定列表对比，取出包含列表中值的记录。
# Aggregate per-URL counts read from stdin and keep only the URLs that start
# with one of the site prefixes listed in SITE_LIST_PATH.
#
# Input lines have three whitespace-separated fields: url, count, posttime.
# Lines with any other number of fields are ignored.  Consecutive lines that
# share the same url are merged into one record.
# NOTE(review): this presumably runs on input already sorted by url (e.g. as a
# Hadoop streaming step) -- non-adjacent repeats of a url are NOT merged.
#
# The trie classes were originally a separate ``tstree`` module; they are
# defined in this file, so the driver uses them directly instead of importing
# ``tstree``.
import itertools
import sys

# File with one site prefix per line, loaded into the trie at start-up.
SITE_LIST_PATH = 'high_freq_site.list'


class TSTNode(object):
    """One node of the ternary search trie.

    ``splitchar`` is the character this node branches on, ``data`` holds the
    stored word when a word ends at this node (otherwise None), and
    ``loNode`` / ``eqNode`` / ``hiNode`` are the children followed for a
    smaller / equal / larger character.
    """

    def __init__(self, splitchar):
        self.splitchar = splitchar
        self.data = None
        self.loNode = None
        self.eqNode = None
        self.hiNode = None


class TernarySearchTrie(object):
    """Ternary search trie used for longest-prefix lookups of stored words."""

    def __init__(self):
        self.rootNode = None

    def loadData(self, fname):
        """Insert every non-blank line of the file ``fname`` as a word.

        Each line is stripped of surrounding whitespace before insertion;
        lines that are empty after stripping are skipped.
        """
        # ``with`` guarantees the handle is closed even if insertion fails.
        with open(fname) as handle:
            for raw_line in handle:
                word = raw_line.strip()
                node = self.addWord(word)
                if node is not None:
                    node.data = word

    def addWord(self, word):
        """Create the path for ``word`` and return the node of its last char.

        Returns None for an empty word.  The caller marks the returned node
        as a word end by setting its ``data`` attribute (see ``loadData``).
        """
        if not word:
            return None
        if self.rootNode is None:
            self.rootNode = TSTNode(word[0])

        node = self.rootNode
        index = 0
        while True:
            diff = ord(word[index]) - ord(node.splitchar)
            if diff == 0:
                index += 1
                if index == len(word):
                    return node
                if node.eqNode is None:
                    node.eqNode = TSTNode(word[index])
                node = node.eqNode
            elif diff < 0:
                if node.loNode is None:
                    node.loNode = TSTNode(word[index])
                node = node.loNode
            else:
                if node.hiNode is None:
                    node.hiNode = TSTNode(word[index])
                node = node.hiNode

    def maxMatch(self, url):
        """Return the longest stored word that is a prefix of ``url``.

        Returns None when no stored word is a prefix of ``url`` (including
        when ``url`` is empty or the trie is empty).
        """
        best = None
        node = self.rootNode
        index = 0
        while node is not None and index < len(url):
            diff = ord(url[index]) - ord(node.splitchar)
            if diff == 0:
                index += 1
                # A word ends here: remember it, a longer one may still follow.
                if node.data:
                    best = node.data
                node = node.eqNode
            elif diff < 0:
                node = node.loNode
            else:
                node = node.hiNode
        return best


def aggregate(lines, tree):
    """Merge consecutive same-url records and yield those matching ``tree``.

    ``lines`` is an iterable of text lines, each expected to hold
    ``url count posttime`` separated by whitespace; lines that do not split
    into exactly three fields are skipped.  For every run of adjacent lines
    with the same url, yields ``(site, url, total_count, earliest_posttime)``
    where ``site`` is ``tree.maxMatch(url)``.  Runs whose url matches no
    stored site are dropped.

    Raises ValueError if a count or posttime field is not an integer.
    """
    records = (fields for fields in (line.split() for line in lines)
               if len(fields) == 3)
    for url, group in itertools.groupby(records, key=lambda rec: rec[0]):
        total = 0
        earliest = None
        for _, count, posttime in group:
            stamp = int(posttime)
            total += int(count)
            # Every record of the run contributes its posttime, including
            # the first one (which the old list-reset logic discarded).
            if earliest is None or stamp < earliest:
                earliest = stamp
        site = tree.maxMatch(url)
        if site:
            yield site, url, total, earliest


def lookup(lines, tree):
    """Yield ``tree.maxMatch`` of each stripped line (None when no match)."""
    for line in lines:
        yield tree.maxMatch(line.strip())


def main(argv=None):
    """Load the site list and process stdin.

    With no arguments, prints one tab-separated line
    ``site  url  total_count  earliest_posttime`` per aggregated url.
    With the single argument ``--lookup``, prints the matched site (or None)
    for every input line instead -- the former ``tstree`` self-test.
    """
    args = sys.argv[1:] if argv is None else argv
    tree = TernarySearchTrie()
    tree.loadData(SITE_LIST_PATH)

    if args == ['--lookup']:
        for site in lookup(sys.stdin, tree):
            print(site)
        return

    for row in aggregate(sys.stdin, tree):
        print('%s\t%s\t%s\t%s' % row)


if __name__ == '__main__':
    main()