1. Define functions to parse the product data
import re

DATAFILE_PATTERN = '^(.+),"(.+)",(.*),(.*),(.*)'

def removeQuotes(s):
    return ''.join(i for i in s if i != '"')

def parseDatafileLine(datafileLine):
    match = re.search(DATAFILE_PATTERN, datafileLine)
    if match is None:
        print 'Invalid datafile line: %s' % datafileLine
        return (datafileLine, -1)
    elif match.group(1) == '"id"':
        print 'Header datafile line: %s' % datafileLine
        return (datafileLine, 0)
    else:
        product = '%s %s %s' % (match.group(2), match.group(3), match.group(4))
        return ((removeQuotes(match.group(1)), product), 1)
2. Create the initial product data RDDs
import sys
import os

baseDir = os.path.join('/data')
inputPath = os.path.join('12', '4')
GOOGLE_PATH = 'Google.csv'
GOOGLE_SMALL_PATH = 'Google_small.csv'
AMAZON_PATH = 'Amazon.csv'
AMAZON_SMALL_PATH = 'Amazon_small.csv'
GOLD_STANDARD_PATH = 'Amazon_Google_perfectMapping.csv'
STOPWORDS_PATH = 'stopwords.txt'

def loadData(path):
    filename = os.path.join(baseDir, inputPath, path)
    raw = (sc
           .textFile(filename, 4, 0)
           .map(parseDatafileLine)
           .cache())
    failed = (raw
              .filter(lambda s: s[1] == -1)
              .map(lambda s: s[0]))
    for line in failed.take(10):
        print '%s - Invalid datafile line: %s' % (path, line)
    valid = (raw
             .filter(lambda s: s[1] == 1)
             .map(lambda s: s[0])
             .cache())
    print '%s - Read %d lines, successfully parsed %d lines, failed to parse %d lines' % (path,
                                                                                           raw.count(),
                                                                                           valid.count(),
                                                                                           failed.count())
    return valid

googleSmall = loadData(GOOGLE_SMALL_PATH)
google = loadData(GOOGLE_PATH)
amazonSmall = loadData(AMAZON_SMALL_PATH)
amazon = loadData(AMAZON_PATH)

for line in googleSmall.take(3):
    print 'google: %s: %s\n' % (line[0], line[1])

for line in amazonSmall.take(3):
    print 'amazon: %s: %s\n' % (line[0], line[1])
3. Define a text tokenization function
quickbrownfox = 'A quick brown fox jumps over the lazy dog.'
split_regex = r'\W+'

def simpleTokenize(string):
    return [x for x in re.split(split_regex, string.lower()) if x != '']

print simpleTokenize(quickbrownfox)
4. Remove stopwords
stopfile = os.path.join(baseDir, inputPath, STOPWORDS_PATH)
stopwords = set(sc.textFile(stopfile).collect())
print 'These are the stopwords: %s' % stopwords

def tokenize(string):
    return [x for x in re.split(split_regex, string.lower()) if x != '' and x not in stopwords]

print tokenize(quickbrownfox)
5. Tokenize the two small datasets
amazonRecToToken = amazonSmall.map(lambda x: (x[0], tokenize(x[1])))
googleRecToToken = googleSmall.map(lambda x: (x[0], tokenize(x[1])))

def countTokens(vendorRDD):
    return vendorRDD.flatMap(lambda x: x[1]).count()

totalTokens = countTokens(amazonRecToToken) + countTokens(googleRecToToken)
print 'There are %s tokens in the combined datasets' % totalTokens
6. (Exercise) Find the product record with the most tokens in the small Amazon dataset
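A minimal sketch of one way to do this exercise, reusing amazonRecToToken from step 5 (findBiggestRecord and biggestRecordAmazon are illustrative names, not part of the original listing): order the (ID, tokens) pairs by descending token count and take the top record.
def findBiggestRecord(vendorRDD):
    # return the single (ID, tokens) pair with the most tokens
    return vendorRDD.takeOrdered(1, key=lambda x: -len(x[1]))

biggestRecordAmazon = findBiggestRecord(amazonRecToToken)
print 'The Amazon record with ID "%s" has the most tokens (%s)' % (biggestRecordAmazon[0][0],
                                                                   len(biggestRecordAmazon[0][1]))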
7. Define a function to compute TF
def tf(tokens):
    N = float(len(tokens))
    tokenDict = {}
    for token in tokens:
        tokenDict[token] = tokenDict[token] + 1 if token in tokenDict else 1
    for k in tokenDict.keys():
        tokenDict[k] = tokenDict[k] / N
    return tokenDict

print tf(tokenize(quickbrownfox))
8. Create the union RDD of the small Google and Amazon datasets
corpusRDD = amazonRecToToken.union(googleRecToToken)
9. Compute the IDF of every token
def idfs(corpus):
    N = float(corpus.count())
    uniqueTokens = corpus.map(lambda x: list(set(x[1])))
    tokenCountPairTuple = uniqueTokens.flatMap(lambda tokens: [(t, 1) for t in tokens])
    tokenSumPairTuple = tokenCountPairTuple.reduceByKey(lambda a, b: a + b)
    return (tokenSumPairTuple.map(lambda (t, f): (t, N / f)))

idfsSmall = idfs(corpusRDD)
uniqueTokenCount = idfsSmall.count()
print 'There are %s unique tokens in the small datasets.' % uniqueTokenCount
print idfsSmall.takeOrdered(5, lambda s: s[1])
10. Define a function to compute TF-IDF
def tfidf(tokens, idfs):
    tfIdfDict = {}
    tfs = tf(tokens)
    for token in tfs.keys():
        tfIdfDict[token] = tfs[token] * idfs[token]
    return tfIdfDict

recb000hkgj8k = amazonRecToToken.filter(lambda x: x[0] == 'b000hkgj8k').collect()[0][1]
idfsSmallWeights = idfsSmall.collectAsMap()
rec_b000hkgj8k_weights = tfidf(recb000hkgj8k, idfsSmallWeights)
print 'Amazon record "b000hkgj8k" has tokens and weights:\n%s' % rec_b000hkgj8k_weights
11. Define functions to compute cosine similarity
import math

def dotprod(a, b):
    dp = 0.0
    for k in set(a.keys()).intersection(set(b.keys())):
        dp += a[k] * b[k]
    return dp

def norm(a):
    x = 0.0
    for k in a.keys():
        x += a[k] ** 2
    return math.sqrt(x)

def cossim(a, b):
    return dotprod(a, b) / (norm(a) * norm(b))

testVec1 = {'foo': 2, 'bar': 3, 'baz': 5}
testVec2 = {'foo': 1, 'bar': 0, 'baz': 20}
dp = dotprod(testVec1, testVec2)
nm = norm(testVec1)
print dp, nm
12. Define a function to compute TF-IDF cosine similarity
def cosineSimilarity(string1, string2, idfsDictionary):
    w1 = tfidf(tokenize(string1), idfsDictionary)
    w2 = tfidf(tokenize(string2), idfsDictionary)
    return cossim(w1, w2)

cossimAdobe = cosineSimilarity('Adobe Photoshop',
                               'Adobe Illustrator',
                               idfsSmallWeights)
print cossimAdobe
13. Perform entity resolution
crossSmall = (googleSmall
              .cartesian(amazonSmall)
              .cache())

def computeSimilarity(record):
    googleRec = record[0]
    amazonRec = record[1]
    googleURL = googleRec[0]
    amazonID = amazonRec[0]
    googleValue = googleRec[1]
    amazonValue = amazonRec[1]
    cs = cosineSimilarity(googleValue, amazonValue, idfsSmallWeights)
    return (googleURL, amazonID, cs)

similarities = (crossSmall
                .map(computeSimilarity)
                .cache())

def similar(amazonID, googleURL):
    return (similarities
            .filter(lambda record: (record[0] == googleURL and record[1] == amazonID))
            .collect()[0][2])

similarityAmazonGoogle = similar('b000o24l3q', 'http://www.google.com/base/feeds/snippets/17242822440574356561')
print 'Requested similarity is %s.' % similarityAmazonGoogle
14. The gold-standard product matching dataset
GOLDFILE_PATTERN = '^(.+),(.+)'

def parse_goldfile_line(goldfile_line):
    match = re.search(GOLDFILE_PATTERN, goldfile_line)
    if match is None:
        print 'Invalid goldfile line: %s' % goldfile_line
        return (goldfile_line, -1)
    elif match.group(1) == '"idAmazon"':
        print 'Header datafile line: %s' % goldfile_line
        return (goldfile_line, 0)
    else:
        key = '%s %s' % (removeQuotes(match.group(1)), removeQuotes(match.group(2)))
        return ((key, 'gold'), 1)

goldfile = os.path.join(baseDir, inputPath, GOLD_STANDARD_PATH)
gsRaw = (sc
         .textFile(goldfile)
         .map(parse_goldfile_line)
         .cache())
gsFailed = (gsRaw
            .filter(lambda s: s[1] == -1)
            .map(lambda s: s[0]))
for line in gsFailed.take(10):
    print 'Invalid goldfile line: %s' % line
goldStandard = (gsRaw
                .filter(lambda s: s[1] == 1)
                .map(lambda s: s[0])
                .cache())
print 'Read %d lines, successfully parsed %d lines, failed to parse %d lines' % (gsRaw.count(),
                                                                                 goldStandard.count(),
                                                                                 gsFailed.count())
15. Count the matching products in the small Google and Amazon datasets
sims = similarities.map(lambda x: (x[1] + ' ' + x[0], x[2]))

trueDupsRDD = (sims
               .join(goldStandard)
               .map(lambda x: (x[0], x[1][0])))
trueDupsCount = trueDupsRDD.count()
print 'There are %s true duplicates.' % trueDupsCount
16. (Exercise) Compute the average cosine similarity of matching products
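A minimal sketch of one way to do this exercise, reusing trueDupsRDD from step 15, whose values are the cosine similarities of the gold-standard matches (avgSimDups is an illustrative name):
avgSimDups = trueDupsRDD.map(lambda x: x[1]).mean()
print 'The average similarity of true duplicates is %s.' % avgSimDups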
17. (Exercise) Compute the average cosine similarity of non-matching products
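A minimal sketch of one way to do this exercise: a left outer join of sims with goldStandard keeps every scored pair, and the pairs with no gold-standard entry are the non-matches (nonDupsRDD and avgSimNon are illustrative names):
nonDupsRDD = (sims
              .leftOuterJoin(goldStandard)
              .filter(lambda x: x[1][1] is None)
              .map(lambda x: x[1][0]))
avgSimNon = nonDupsRDD.mean()
print 'The average similarity of non-duplicates is %s.' % avgSimNon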
18. (Exercise) Optimize the algorithm and try the full Amazon and Google product datasets
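A minimal sketch of one possible optimization (all names below are illustrative, and this is not the book's official solution): recompute the IDFs on the full corpus and broadcast them, precompute one TF-IDF weight dictionary per record, and build an inverted index on tokens so that cosine similarity is only computed for record pairs that share at least one token, rather than over the full Cartesian product. It assumes the collected weight dictionaries fit in driver and executor memory.
amazonFullRecToToken = amazon.map(lambda x: (x[0], tokenize(x[1])))
googleFullRecToToken = google.map(lambda x: (x[0], tokenize(x[1])))
fullCorpusRDD = amazonFullRecToToken.union(googleFullRecToToken)

# IDF weights for the full corpus, broadcast to every worker
idfsFull = idfs(fullCorpusRDD)
idfsFullBroadcast = sc.broadcast(idfsFull.collectAsMap())

# precompute one TF-IDF weight dictionary per record
# (assumes these dictionaries fit in driver/executor memory)
amazonWeightsRDD = amazonFullRecToToken.map(lambda x: (x[0], tfidf(x[1], idfsFullBroadcast.value)))
googleWeightsRDD = googleFullRecToToken.map(lambda x: (x[0], tfidf(x[1], idfsFullBroadcast.value)))
amazonWeightsBroadcast = sc.broadcast(amazonWeightsRDD.collectAsMap())
googleWeightsBroadcast = sc.broadcast(googleWeightsRDD.collectAsMap())

# inverted index: (token, recordID) pairs
def invert(record):
    ID, weights = record
    return [(token, ID) for token in weights]

amazonInvPairsRDD = amazonWeightsRDD.flatMap(invert).cache()
googleInvPairsRDD = googleWeightsRDD.flatMap(invert).cache()

# candidate pairs are records that share at least one token
candidatePairsRDD = (amazonInvPairsRDD
                     .join(googleInvPairsRDD)
                     .map(lambda x: x[1])
                     .distinct())

def fastCosineSimilarity(pair):
    amazonID, googleURL = pair
    w1 = amazonWeightsBroadcast.value[amazonID]
    w2 = googleWeightsBroadcast.value[googleURL]
    return ((amazonID, googleURL), cossim(w1, w2))

similaritiesFullRDD = candidatePairsRDD.map(fastCosineSimilarity).cache()
print similaritiesFullRDD.count()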
[Appendix: Input Code Listing]
1. Define functions to parse the movie ratings and movie information data
def get_ratings_tuple(entry):
    items = entry.split('::')
    return int(items[0]), int(items[1]), float(items[2])

def get_movie_tuple(entry):
    items = entry.split('::')
    return int(items[0]), items[1]
2. Create the movie ratings and movie information RDDs
import sys
import os
baseDir = os.path.join('/data')
inputPath = os.path.join('12', '5')
ratingsFilename = os.path.join(baseDir, inputPath, 'ratings.dat.gz')
moviesFilename = os.path.join(baseDir, inputPath, 'movies.dat')
numPartitions = 2
rawRatings = sc.textFile(ratingsFilename).repartition(numPartitions)
rawMovies = sc.textFile(moviesFilename)
ratingsRDD = rawRatings.map(get_ratings_tuple).cache()
moviesRDD = rawMovies.map(get_movie_tuple).cache()
ratingsCount = ratingsRDD.count()
moviesCount = moviesRDD.count()
print 'There are %s ratings and %s movies in the datasets' % (ratingsCount, moviesCount)
print 'Ratings: %s' % ratingsRDD.take(3)
print 'Movies: %s' % moviesRDD.take(3)
3. Define a function for sorting by key and value
def sortFunction(tuple):
    key = unicode('%.3f' % tuple[0])
    value = tuple[1]
    return (key + ' ' + value)
4. Define a function to compute the number of ratings and the average rating
def getCountsAndAverages(IDandRatingsTuple):
    ratingsCount = len(IDandRatingsTuple[1])
    ratingsSum = float(sum(IDandRatingsTuple[1]))
    return (IDandRatingsTuple[0], (ratingsCount, ratingsSum / ratingsCount))

getCountsAndAverages((1, (1, 2, 3, 4)))
5. Select the highest-rated movies
movieIDsWithRatingsRDD = (ratingsRDD
                          .map(lambda x: (x[1], x[2]))
                          .groupByKey())
movieIDsWithRatingsRDD.take(3)
print 'movieIDsWithRatingsRDD: %s\n' % movieIDsWithRatingsRDD.take(3)

movieIDsWithAvgRatingsRDD = movieIDsWithRatingsRDD.map(getCountsAndAverages)
print 'movieIDsWithAvgRatingsRDD: %s\n' % movieIDsWithAvgRatingsRDD.take(3)

movieNameWithAvgRatingsRDD = (moviesRDD
                              .join(movieIDsWithAvgRatingsRDD)
                              .map(lambda x: (x[1][1][1], x[1][0], x[1][1][0])))
print 'movieNameWithAvgRatingsRDD: %s\n' % movieNameWithAvgRatingsRDD.take(3)
6. Select the highest-rated movies with more than 500 ratings
movieLimitedAndSortedByRatingRDD = (movieNameWithAvgRatingsRDD
                                    .filter(lambda x: x[2] > 500)
                                    .sortBy(sortFunction, False))
print 'Movies with highest ratings: %s' % movieLimitedAndSortedByRatingRDD.take(20)
7. Create the training set for collaborative filtering recommendation
trainingRDD, validationRDD, testRDD = ratingsRDD.randomSplit([6, 2, 2], seed=0L)
print 'Training: %s, validation: %s, test: %s\n' % (trainingRDD.count(),
                                                    validationRDD.count(),
                                                    testRDD.count())
8. Define a function to compute root mean square error (RMSE)
import math

def computeError(predictedRDD, actualRDD):
    predictedReformattedRDD = predictedRDD.map(lambda x: ((x[0], x[1]), x[2]))
    actualReformattedRDD = actualRDD.map(lambda x: ((x[0], x[1]), x[2]))
    squaredErrorsRDD = (predictedReformattedRDD
                        .join(actualReformattedRDD)
                        .map(lambda x: (x[1][0] - x[1][1]) ** 2))
    totalError = squaredErrorsRDD.sum()
    numRatings = squaredErrorsRDD.count()
    return math.sqrt(float(totalError) / numRatings)

testPredicted = sc.parallelize([
    (1, 1, 5),
    (1, 2, 3),
    (1, 3, 4),
    (2, 1, 3),
    (2, 2, 2),
    (2, 3, 4)])
testActual = sc.parallelize([
    (1, 2, 3),
    (1, 3, 5),
    (2, 1, 5),
    (2, 2, 1)])
testError = computeError(testPredicted, testActual)
print 'Error for test dataset: %s' % testError
9. Train the prediction model
from pyspark.mllib.recommendation import ALS

validationForPredictRDD = validationRDD.map(lambda x: (x[0], x[1]))

seed = 5L
iterations = 5
regularizationParameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.03

minError = float('inf')
bestRank = -1
bestIteration = -1
for rank in ranks:
    model = ALS.train(trainingRDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularizationParameter)
    predictedRatingsRDD = model.predictAll(validationForPredictRDD)
    error = computeError(predictedRatingsRDD, validationRDD)
    errors[err] = error
    err += 1
    print 'For rank %s the RMSE is %s' % (rank, error)
    if error < minError:
        minError = error
        bestRank = rank

print 'The best model was trained with rank %s' % bestRank
10. Test the prediction model
myModel = ALS.train(trainingRDD, ranks[2], seed=seed, iterations=iterations,
                    lambda_=regularizationParameter)
testForPredictingRDD = testRDD.map(lambda x: (x[0], x[1]))
predictedTestRDD = myModel.predictAll(testForPredictingRDD)
testRMSE = computeError(testRDD, predictedTestRDD)
print 'The model had a RMSE on the test set of %s' % testRMSE
11. Compare model performance
trainingAvgRating = trainingRDD.map(lambda x : x[2]).sum() / float(trainingRDD.map(lambda x : x[2]).count())
print 'The average rating for movies in the training set is %s' % trainingAvgRating
testForAvgRDD = testRDD.map(lambda x : (x[0], x[1], trainingAvgRating))
testAvgRMSE = computeError(testRDD, testForAvgRDD)
print 'The RMSE on the average set is %s' % testAvgRMSE