1. 定义函数解析产品数据

import re
# Regex for one data-file line: five comma-separated fields, the second one
# wrapped in double quotes.
DATAFILE_PATTERN = r'^(.+),"(.+)",(.*),(.*),(.*)'

def removeQuotes(s):
    """Return ``s`` with every double-quote character removed.

    Args:
        s (str): input string.

    Returns:
        str: ``s`` without any ``"`` characters.
    """
    return s.replace('"', '')

def parseDatafileLine(datafileLine):
    """Parse one line of a product data file.

    Args:
        datafileLine (str): a raw line read from the data file.

    Returns:
        tuple: ``(datafileLine, -1)`` if the line does not match
        DATAFILE_PATTERN; ``(datafileLine, 0)`` if it is the header line
        (first field is ``"id"``); otherwise ``((id, product), 1)`` where
        ``id`` is the first field with double quotes removed and ``product``
        is fields 2-4 joined by single spaces.
    """
    match = re.search(DATAFILE_PATTERN, datafileLine)
    if match is None:
        print('Invalid datafile line: %s' % datafileLine)
        return (datafileLine, -1)
    if match.group(1) == '"id"':
        print('Header datafile line: %s' % datafileLine)
        return (datafileLine, 0)
    # Regular record. This part used to sit after the header branch's
    # ``return`` and was therefore unreachable, so valid lines yielded None.
    product = '%s %s %s' % (match.group(2), match.group(3), match.group(4))
    return ((removeQuotes(match.group(1)), product), 1)

2. 创建初始产品数据RDD

import sys
import os

# Directory components; loadData joins baseDir, inputPath and a file name
# to build the full path of each input file.
baseDir = os.path.join('/data')
inputPath = os.path.join('12', '4')

# File names of the product datasets (full and small variants), the
# gold-standard mapping file and the stopword list.
GOOGLE_PATH = 'Google.csv'
GOOGLE_SMALL_PATH = 'Google_small.csv'
AMAZON_PATH = 'Amazon.csv'
AMAZON_SMALL_PATH = 'Amazon_small.csv'
GOLD_STANDARD_PATH = 'Amazon_Google_perfectMapping.csv'
STOPWORDS_PATH = 'stopwords.txt'

def loadData(path):
    """Load a product data file and return its successfully parsed records.

    Args:
        path (str): file name, resolved under ``baseDir``/``inputPath``.

    Returns:
        RDD: the payloads (element 0) of the parsed entries whose status
        flag (element 1) equals 1.
    """
    filename = os.path.join(baseDir, inputPath, path)
    # Each line becomes a (payload, status) pair via parseDatafileLine; the
    # filters below rely on that status flag. Cached because the RDD feeds
    # several actions (take / count) in this function.
    raw = (sc
           .textFile(filename, 4, 0)
           .map(parseDatafileLine)
           .cache())
    failed = (raw
              .filter(lambda s: s[1] == -1)
              .map(lambda s: s[0]))
    for line in failed.take(10):
        print('%s - Invalid datafile line: %s' % (path, line))
    valid = (raw
             .filter(lambda s: s[1] == 1)
             .map(lambda s: s[0])
             .cache())
    print('%s - Read %d lines, successfully parsed %d lines, failed to parse %d lines' % (
        path, raw.count(), valid.count(), failed.count()))
    return valid

# Load the small and full variants of both product datasets.
googleSmall = loadData(GOOGLE_SMALL_PATH)
google = loadData(GOOGLE_PATH)
amazonSmall = loadData(AMAZON_SMALL_PATH)
amazon = loadData(AMAZON_PATH)

# Preview the first three records of each small dataset.
for line in googleSmall.take(3):
    print('google: %s: %s\n' % (line[0], line[1]))

for line in amazonSmall.take(3):
    print('amazon: %s: %s\n' % (line[0], line[1]))

3. 定义文本记号化函数

# Sample sentence used to exercise the tokenizers below.
quickbrownfox = 'A quick brown fox jumps over the lazy dog.'
# Token separator: one or more non-word characters.
split_regex = r'\W+'

def simpleTokenize(string):
    """Lower-case ``string`` and split it into tokens on ``split_regex``.

    Args:
        string (str): input text.

    Returns:
        list: the non-empty pieces produced by the split, in order.
    """
    pieces = re.split(split_regex, string.lower())
    # Splitting can yield empty strings at the edges; drop them.
    return [piece for piece in pieces if piece]

# Show the tokens of the sample sentence.
print(simpleTokenize(quickbrownfox))

4. 去除停止词

# Read the stopword file (one entry per line) into a set for fast membership tests.
stopfile = os.path.join(baseDir, inputPath, STOPWORDS_PATH)
stopwords = set(sc.textFile(stopfile).collect())
print('These are the stopwords: %s' % stopwords)

def tokenize(string):
    """Tokenize ``string`` like simpleTokenize, then drop stopwords.

    Args:
        string (str): input text.

    Returns:
        list: lower-cased, non-empty tokens that are not in ``stopwords``.
    """
    return [token for token in simpleTokenize(string) if token not in stopwords]

# Show the sample sentence's tokens after stopword removal.
print(tokenize(quickbrownfox))

5. 记号化两个小数据集

# Map every (id, text) record of the small datasets to (id, token list).
amazonRecToToken = amazonSmall.map(lambda rec: (rec[0], tokenize(rec[1])))
googleRecToToken = googleSmall.map(lambda rec: (rec[0], tokenize(rec[1])))

def countTokens(vendorRDD):
    """Count the tokens across all records of ``vendorRDD``.

    Args:
        vendorRDD (RDD): entries whose element 1 is an iterable of tokens.

    Returns:
        int: total number of tokens (duplicates included).
    """
    allTokens = vendorRDD.flatMap(lambda record: record[1])
    return allTokens.count()

# Total token count over both tokenized small datasets.
amazonTokenCount = countTokens(amazonRecToToken)
googleTokenCount = countTokens(googleRecToToken)
totalTokens = amazonTokenCount + googleTokenCount
print('There are %s tokens in the combined datasets' % totalTokens)

6. (练习)查看亚马逊产品小数据集中记号数最多的产品记录

7. 定义计算TF的函数

def tf(tokens):
    """Compute the term frequency of every token in ``tokens``.

    Args:
        tokens (list): tokens of one document (duplicates allowed).

    Returns:
        dict: token -> (occurrences of token) / (total number of tokens).
        An empty input yields an empty dict.
    """
    N = float(len(tokens))
    counts = {}
    for token in tokens:
        # dict.get is a single hash lookup; the previous
        # ``token in tokenDict.keys()`` test rebuilt the key list on every
        # iteration under Python 2.
        counts[token] = counts.get(token, 0) + 1
    return dict((token, count / N) for token, count in counts.items())

# Show the term frequencies of the sample sentence.
print(tf(tokenize(quickbrownfox)))

8. 创建谷歌和亚马逊小数据集的并集RDD

# Combined corpus: tokenized Amazon records followed by tokenized Google records.
corpusRDD = amazonRecToToken.union(googleRecToToken)

9. 计算所有记号的IDF

def idfs(corpus):
    """Compute an inverse-document-frequency weight for every token.

    Args:
        corpus (RDD): entries whose element 1 is an iterable of tokens.

    Returns:
        RDD: ``(token, N / f)`` pairs, where ``N`` is the number of corpus
        entries and ``f`` the number of entries containing the token.
    """
    N = float(corpus.count())
    # De-duplicate tokens within each entry so an entry contributes at most
    # one count per token.
    tokenCountPairTuple = corpus.flatMap(lambda x: [(t, 1) for t in set(x[1])])
    tokenSumPairTuple = tokenCountPairTuple.reduceByKey(lambda a, b: a + b)
    # Index the pair instead of using a tuple-unpacking lambda parameter,
    # which is Python 2-only syntax.
    return tokenSumPairTuple.map(lambda pair: (pair[0], N / pair[1]))

# IDF weights for the combined small corpus.
idfsSmall = idfs(corpusRDD)
uniqueTokenCount = idfsSmall.count()
print('There are %s unique tokens in the small datasets.' % uniqueTokenCount)
# The five tokens with the smallest IDF weight.
print(idfsSmall.takeOrdered(5, key=lambda pair: pair[1]))

10. 定义计算TF-IDF的函数

def tfidf(tokens, idfs):
    """Compute TF-IDF weights for the tokens of one document.

    Args:
        tokens (list): tokens of the document.
        idfs (dict): token -> IDF weight; must contain every token in
            ``tokens`` (a missing token raises KeyError).

    Returns:
        dict: token -> term frequency * IDF weight.
    """
    return dict((token, freq * idfs[token])
                for token, freq in tf(tokens).items())

# Token list of the Amazon record whose id is 'b000hkgj8k'.
recb000hkgj8k = (amazonRecToToken
                 .filter(lambda rec: rec[0] == 'b000hkgj8k')
                 .collect()[0][1])
# Bring the IDF weights to the driver as a plain dict.
idfsSmallWeights = idfsSmall.collectAsMap()
rec_b000hkgj8k_weights = tfidf(recb000hkgj8k, idfsSmallWeights)
print('Amazon record "b000hkgj8k" has tokens and weights:\n%s' % rec_b000hkgj8k_weights)

11. 定义计算余弦相似度的函数

import math

def dotprod(a, b):
    """Compute the dot product of two sparse vectors stored as dicts.

    Args:
        a (dict): key -> numeric weight.
        b (dict): key -> numeric weight.

    Returns:
        float: sum of ``a[k] * b[k]`` over the keys present in both dicts.
    """
    shared = set(a) & set(b)
    total = 0.0
    for key in shared:
        total += a[key] * b[key]
    return total

def norm(a):
    """Compute the Euclidean norm of a sparse vector stored as a dict.

    Args:
        a (dict): key -> numeric weight.

    Returns:
        float: square root of the sum of squared weights.
    """
    squares = 0.0
    for weight in a.values():
        squares += weight ** 2
    return math.sqrt(squares)

def cossim(a, b):
    """Compute the cosine similarity of two sparse vectors stored as dicts.

    Args:
        a (dict): key -> numeric weight.
        b (dict): key -> numeric weight.

    Returns:
        float: ``dotprod(a, b) / (norm(a) * norm(b))``, or 0.0 when either
        vector has zero norm (e.g. an empty dict), instead of raising
        ZeroDivisionError.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # A zero vector has no direction; treat it as dissimilar to everything.
        return 0.0
    return dotprod(a, b) / denominator

# Small hand-made vectors to sanity-check dotprod and norm.
testVec1 = {'foo': 2, 'bar': 3, 'baz': 5}
testVec2 = {'foo': 1, 'bar': 0, 'baz': 20}
dp = dotprod(testVec1, testVec2)
nm = norm(testVec1)
print('%s %s' % (dp, nm))

12. 定义计算TF-IDF余弦相似度的函数

def cosineSimilarity(string1, string2, idfsDictionary):
    """Compute the TF-IDF cosine similarity of two strings.

    Args:
        string1 (str): first text.
        string2 (str): second text.
        idfsDictionary (dict): token -> IDF weight, passed to tfidf.

    Returns:
        float: cosine similarity of the two TF-IDF weight vectors.
    """
    weightVectors = [tfidf(tokenize(text), idfsDictionary)
                     for text in (string1, string2)]
    return cossim(weightVectors[0], weightVectors[1])

# Example: similarity of two short product names, using the small-corpus
# IDF weights as the dictionary argument.
cossimAdobe = cosineSimilarity('Adobe Photoshop',
                               'Adobe Illustrator',
                               idfsSmallWeights)
print(cossimAdobe)

13. 进行实体解析

# Every (google record, amazon record) combination of the small datasets;
# computeSimilarity expects the Google record at index 0 and the Amazon
# record at index 1.
crossSmall = (googleSmall
              .cartesian(amazonSmall)
              .cache())

def computeSimilarity(record):
    """Compute the TF-IDF cosine similarity of one Google/Amazon record pair.

    Args:
        record (tuple): ``(googleRec, amazonRec)``; element 0 of each inner
            record is used as its identifier and element 1 as its text.

    Returns:
        tuple: ``(google identifier, amazon identifier, cosine similarity)``,
        with the similarity computed using ``idfsSmallWeights``.
    """
    googleRec, amazonRec = record[0], record[1]
    score = cosineSimilarity(googleRec[1], amazonRec[1], idfsSmallWeights)
    return (googleRec[0], amazonRec[0], score)

# (googleURL, amazonID, similarity) for every pair in crossSmall; cached
# because it is queried by later lookups.
similarities = (crossSmall
                .map(computeSimilarity)
                .cache())
def similar(amazonID, googleURL):
    """Look up the precomputed similarity of one Amazon/Google pair.

    Args:
        amazonID (str): Amazon identifier (element 1 of a ``similarities`` entry).
        googleURL (str): Google identifier (element 0 of a ``similarities`` entry).

    Returns:
        The similarity value (element 2) of the first matching entry.

    Raises:
        IndexError: if no entry in ``similarities`` matches the pair.
    """
    return (similarities
            .filter(lambda record: (record[0] == googleURL and record[1] == amazonID))
            .collect()[0][2])

# Look up the similarity of one specific Amazon/Google pair.
similarityAmazonGoogle = similar(
    'b000o24l3q',
    'http://www.google.com/base/feeds/snippets/17242822440574356561')
print('Requested similarity is %s.' % similarityAmazonGoogle)

14. 产品真实匹配数据集

# Regex for one gold-standard line: two comma-separated fields.
GOLDFILE_PATTERN = r'^(.+),(.+)'

def parse_goldfile_line(goldfile_line):
    """Parse one line of the gold-standard mapping file.

    Args:
        goldfile_line (str): a raw line read from the gold-standard file.

    Returns:
        tuple: ``(goldfile_line, -1)`` if the line does not match
        GOLDFILE_PATTERN; ``(goldfile_line, 0)`` if it is the header line
        (first field is ``"idAmazon"``); otherwise ``((key, 'gold'), 1)``
        where ``key`` is the two fields, double quotes removed, joined by a
        single space.
    """
    match = re.search(GOLDFILE_PATTERN, goldfile_line)
    if match is None:
        print('Invalid goldfile line: %s' % goldfile_line)
        return (goldfile_line, -1)
    if match.group(1) == '"idAmazon"':
        print('Header datafile line: %s' % goldfile_line)
        return (goldfile_line, 0)
    # Regular mapping line. This part used to sit after the header branch's
    # ``return`` and was therefore unreachable, so valid lines yielded None.
    key = '%s %s' % (removeQuotes(match.group(1)), removeQuotes(match.group(2)))
    return ((key, 'gold'), 1)

goldfile = os.path.join(baseDir, inputPath, GOLD_STANDARD_PATH)
# Each line becomes a (payload, status) pair via parse_goldfile_line; the
# filters below rely on that status flag.
gsRaw = (sc
         .textFile(goldfile)
         .map(parse_goldfile_line)
         .cache())
gsFailed = (gsRaw
            .filter(lambda s: s[1] == -1)
            .map(lambda s: s[0]))
for line in gsFailed.take(10):
    print('Invalid goldfile line: %s' % line)

# Successfully parsed (key, 'gold') pairs.
goldStandard = (gsRaw
                .filter(lambda s: s[1] == 1)
                .map(lambda s: s[0])
                .cache())

print('Read %d lines, successfully parsed %d lines, failed to parse %d lines' % (
    gsRaw.count(), goldStandard.count(), gsFailed.count()))

15. 统计谷歌和亚马逊小数据集中匹配的产品数

# Re-key the similarities as 'amazonID googleURL' so they line up with the
# keys produced by parse_goldfile_line.
sims = similarities.map(lambda x: (x[1] + ' ' + x[0], x[2]))

# Joining with the gold standard keeps only the known true matches; each
# joined value is (similarity, 'gold'), of which the similarity is kept.
trueDupsRDD = (sims
               .join(goldStandard)
               .map(lambda x: (x[0], x[1][0])))
trueDupsCount = trueDupsRDD.count()
print('There are %s true duplicates.' % trueDupsCount)

16. (练习)统计匹配的产品的平均余弦相似度

17. (练习)统计不匹配的产品的平均余弦相似度

18. (练习)优化算法并尝试亚马逊和谷歌产品标准数据集



1. 定义函数解析电影评分和电影信息数据

def get_ratings_tuple(entry):
    """Parse one ``::``-separated line of the ratings file.

    Args:
        entry (str): line whose first three fields are two integers and a number.

    Returns:
        tuple: ``(int(field 0), int(field 1), float(field 2))``; any further
        fields are ignored.
    """
    fields = entry.split('::')
    first = int(fields[0])
    second = int(fields[1])
    rating = float(fields[2])
    return first, second, rating

def get_movie_tuple(entry):
    """Parse one ``::``-separated line of the movies file.

    Args:
        entry (str): line whose first field is an integer and second a string.

    Returns:
        tuple: ``(int(field 0), field 1)``; any further fields are ignored.
    """
    fields = entry.split('::')
    return (int(fields[0]), fields[1])

2. 创建电影评分和电影信息数据RDD

import sys
import os

# Input location of the movie datasets.
baseDir = os.path.join('/data')
inputPath = os.path.join('12', '5')

ratingsFilename = os.path.join(baseDir, inputPath, 'ratings.dat.gz')
moviesFilename = os.path.join(baseDir, inputPath, 'movies.dat')

# Read the raw text files; the ratings are repartitioned after loading.
numPartitions = 2
rawRatings = sc.textFile(ratingsFilename).repartition(numPartitions)
rawMovies = sc.textFile(moviesFilename)

# Parse each line into a tuple and cache the parsed RDDs.
ratingsRDD = rawRatings.map(get_ratings_tuple).cache()
moviesRDD = rawMovies.map(get_movie_tuple).cache()

ratingsCount = ratingsRDD.count()
moviesCount = moviesRDD.count()

print('There are %s ratings and %s movies in the datasets' % (ratingsCount, moviesCount))
print('Ratings: %s' % ratingsRDD.take(3))
print('Movies: %s' % moviesRDD.take(3))

3. 定义按键值排序函数

def sortFunction(tuple):
    """Build a string sort key from the first two elements of ``tuple``.

    Args:
        tuple (tuple): element 0 is a number, element 1 a string.

    Returns:
        The number formatted with three decimals, a space, then the string,
        e.g. ``'4.500 Title'``.
    """
    # A u'' literal gives the same unicode key as unicode('%.3f' % ...) on
    # Python 2 and also runs on Python 3, where the unicode builtin is gone.
    key = u'%.3f' % tuple[0]
    value = tuple[1]
    return key + ' ' + value

4. 定义计算评分数和平均评分的函数

def getCountsAndAverages(IDandRatingsTuple):
    """Compute the number of ratings and their average for one ID.

    Args:
        IDandRatingsTuple (tuple): ``(ID, ratings)`` where ``ratings`` is a
            non-empty sized iterable of numbers.

    Returns:
        tuple: ``(ID, (number of ratings, average rating))``.
    """
    itemId, ratings = IDandRatingsTuple[0], IDandRatingsTuple[1]
    count = len(ratings)
    average = float(sum(ratings)) / count
    return (itemId, (count, average))

# Quick manual check; the result is neither printed nor stored, so it is only
# visible when this line is evaluated in an interactive session.
getCountsAndAverages((1, (1, 2, 3, 4)))

5. 选取评分最高的电影

# (element 1, element 2) of every rating, grouped by key so that
# getCountsAndAverages receives (key, iterable of ratings).
movieIDsWithRatingsRDD = (ratingsRDD
                          .map(lambda x: (x[1], x[2]))
                          .groupByKey())
print('movieIDsWithRatingsRDD: %s\n' % movieIDsWithRatingsRDD.take(3))

# (key, (number of ratings, average rating))
movieIDsWithAvgRatingsRDD = movieIDsWithRatingsRDD.map(getCountsAndAverages)
print('movieIDsWithAvgRatingsRDD: %s\n' % movieIDsWithAvgRatingsRDD.take(3))

# The join yields (key, (name, (count, average))), which is reshaped into
# (average, name, count).
movieNameWithAvgRatingsRDD = (moviesRDD
                              .join(movieIDsWithAvgRatingsRDD)
                              .map(lambda x: (x[1][1][1], x[1][0], x[1][1][0])))
print('movieNameWithAvgRatingsRDD: %s\n' % movieNameWithAvgRatingsRDD.take(3))

6. 选取有超过500条评分记录评分最高的电影

# Keep entries whose element 2 exceeds 500, sorted in descending order of
# the string key built by sortFunction.
movieLimitedAndSortedByRatingRDD = (
    movieNameWithAvgRatingsRDD
    .filter(lambda entry: entry[2] > 500)
    .sortBy(sortFunction, ascending=False))

print('Movies with highest ratings: %s' % movieLimitedAndSortedByRatingRDD.take(20))

7. 创建协同过滤推荐的训练集

# Split the ratings 6:2:2 into training, validation and test sets. The seed
# is a plain 0 (same value as the former Python 2-only literal 0L).
trainingRDD, validationRDD, testRDD = ratingsRDD.randomSplit([6, 2, 2], seed=0)

print('Training: %s, validation: %s, test: %s\n' % (trainingRDD.count(),
                                                    validationRDD.count(),
                                                    testRDD.count()))

8. 定义计算均方根误差的函数

import math

def computeError(predictedRDD, actualRDD):
    """Compute the root mean squared error between predicted and actual values.

    Only entries whose (element 0, element 1) key occurs in both RDDs
    contribute to the error.

    Args:
        predictedRDD (RDD): 3-tuples; elements 0 and 1 form the key and
            element 2 is the predicted value (presumably user, movie,
            rating -- confirm against callers).
        actualRDD (RDD): 3-tuples of the same layout holding actual values.

    Returns:
        float: square root of the mean squared difference over shared keys.

    Raises:
        ZeroDivisionError: if the two RDDs share no key.
    """
    predictedReformattedRDD = predictedRDD.map(lambda x: ((x[0], x[1]), x[2]))
    actualReformattedRDD = actualRDD.map(lambda x: ((x[0], x[1]), x[2]))
    # The join pairs each prediction with its actual value:
    # (key, (predicted, actual)).
    squaredErrorsRDD = (predictedReformattedRDD
                        .join(actualReformattedRDD)
                        .map(lambda x: (x[1][0] - x[1][1]) ** 2))
    totalError = squaredErrorsRDD.sum()
    numRatings = squaredErrorsRDD.count()
    return math.sqrt(float(totalError) / numRatings)

# Tiny hand-made datasets to sanity-check computeError.
predictedRows = [(1, 1, 5), (1, 2, 3), (1, 3, 4),
                 (2, 1, 3), (2, 2, 2), (2, 3, 4)]
actualRows = [(1, 2, 3), (1, 3, 5), (2, 1, 5), (2, 2, 1)]
testPredicted = sc.parallelize(predictedRows)
testActual = sc.parallelize(actualRows)
testError = computeError(testPredicted, testActual)

print('Error for test dataset: %s' % testError)

9. 训练预测模型

from pyspark.mllib.recommendation import ALS

# Keys (element 0, element 1) of the validation entries, used as input to predictAll.
validationForPredictRDD = validationRDD.map(lambda x: (x[0], x[1]))

# Plain 5 has the same value as the former Python 2-only literal 5L.
seed = 5
iterations = 5
regularizationParameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.03

# Train one model per candidate rank and keep the rank with the lowest
# validation RMSE.
minError = float('inf')
bestRank = -1
bestIteration = -1
for rank in ranks:
    model = ALS.train(trainingRDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularizationParameter)
    predictedRatingsRDD = model.predictAll(validationForPredictRDD)
    error = computeError(predictedRatingsRDD, validationRDD)
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < minError:
        minError = error
        bestRank = rank

print('The best model was trained with rank %s' % bestRank)

10. 测试预测模型

# Retrain with the rank selected on the validation set (instead of a
# hard-coded ranks[2]) and the same hyper-parameters.
myModel = ALS.train(trainingRDD, bestRank, seed=seed, iterations=iterations,
                    lambda_=regularizationParameter)

testForPredictingRDD = testRDD.map(lambda x: (x[0], x[1]))
predictedTestRDD = myModel.predictAll(testForPredictingRDD)
# Arguments follow computeError's (predicted, actual) order; the squared
# difference makes the result independent of that order.
testRMSE = computeError(predictedTestRDD, testRDD)
print('The model had a RMSE on the test set of %s' % testRMSE)

11. 比较模型性能

# Baseline: predict the training-set average rating for every test entry.
trainingAvgRating = (trainingRDD.map(lambda entry: entry[2]).sum()
                     / float(trainingRDD.map(lambda entry: entry[2]).count()))
print('The average rating for movies in the training set is %s' % trainingAvgRating)

testForAvgRDD = testRDD.map(lambda entry: (entry[0], entry[1], trainingAvgRating))
testAvgRMSE = computeError(testRDD, testForAvgRDD)
print('The RMSE on the average set is %s' % testAvgRMSE)

12. (练习)设计自己的推荐算法