numpy初用
import numpy as np
# Print a distribution summary (range, mean, histogram, evenly spaced
# samples) for every value list stored in ``stat``.
# Written to behave identically on Python 2 and Python 3: single-argument
# print() calls, dict.items(), and explicit floor division.
for k, v in stat.items():
    print(k)
    v.sort()  # in place: the list held by ``stat`` stays sorted afterwards
    if not v:
        # min()/max() of an empty array raise ValueError; nothing to report.
        continue
    # Optional trim that drops the lowest and highest 3% of the sorted values:
    # v = v[len(v) * 3 // 100:len(v) * 97 // 100]
    data = np.array(v)
    # 20 evenly spaced edges from min to max, i.e. 19 equal-width bins.
    hist, bins = np.histogram(
        data, bins=np.linspace(data.min(), data.max(), 20))
    # Alternative that lets numpy pick 20 bins:
    # hist, bins = np.histogram(data, bins=20)
    print("%s %s %s" % (data.min(), data.max(), np.average(data)))
    print(hist)
    print(bins)
    # Every (len // 20)-th sorted value. The step is clamped to 1 because a
    # zero step (fewer than 20 values) makes the slice raise ValueError.
    step = max(1, len(v) // 20)
    print(v[::step])
# Read 'candidate_words' (one space-separated candidate per line), record
# each candidate in ``query_vocab`` and add its importance-weighted,
# unit-length average term vector to ``aindex`` under the line number.
with open('candidate_words') as f:
    for nline, line in enumerate(f):
        terms = line.strip().split(' ')
        # Registered before the skip checks below, so the vocabulary also
        # contains candidates that never receive an index entry.
        query_vocab[''.join(terms)] = nline

        # Filter once so vectors and weights are guaranteed to stay aligned.
        known_terms = [term for term in terms if term in vec_space]
        if not known_terms:
            continue
        weights = [term_imp_dict.get(term, 0.0) for term in known_terms]
        # np.average raises ZeroDivisionError when the weights sum to zero.
        # NOTE(review): this check assumes importance weights are
        # non-negative -- confirm against how term_imp_dict is built.
        if max(weights) == 0:
            continue

        vecs = np.array([vec_space[term] for term in known_terms])
        terms_vec = np.average(vecs, axis=0, weights=weights)
        terms_vec_len = np.linalg.norm(terms_vec)
        # Normalize to unit length. The original stored the result under a
        # misspelled name, so the un-normalized vector was what got indexed.
        # A zero-length vector is left as is to avoid dividing by zero.
        if terms_vec_len > 0:
            terms_vec = terms_vec / terms_vec_len
        aindex.add_item(nline, terms_vec.tolist())