Taste is the collaborative-filtering implementation that ships with Apache Mahout: a scalable, efficient recommendation engine written in Java. "Scalable" here means the heavy computation can be run as Hadoop MapReduce jobs, which is what keeps performance reasonable on large datasets.
I recently started reading the source code, and these are my notes from working through it. The RecommenderJob class is the main driver for Mahout's Hadoop-based recommendation engine, so the analysis below starts there.
The run() method is the entry point:
public final class RecommenderJob extends AbstractJob {

  public static final String BOOLEAN_DATA = "booleanData";

  private static final int DEFAULT_MAX_SIMILARITIES_PER_ITEM = 100;
  private static final int DEFAULT_MAX_PREFS_PER_USER = 1000;
  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;

  @Override
  public int run(String[] args) throws Exception {
    // A large block of option-parsing code is omitted here; it only loads
    // configuration values and can be ignored for this analysis.

    // Phase 1: prepare the matrix -- convert the raw preference data into a
    // rating matrix. The work is done by PreparePreferenceMatrixJob.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{
          "--input", getInputPath().toString(),
          "--output", prepPath.toString(),
          "--maxPrefsPerUser", String.valueOf(maxPrefsPerUserInItemSimilarity),
          "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
          "--booleanData", String.valueOf(booleanData),
          "--tempDir", getTempPath().toString()});

      numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
    }

    // Phase 2: compute the co-occurrence (item-item similarity) matrix.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      /* special behavior if phase 1 is skipped */
      if (numberOfUsers == -1) {
        numberOfUsers = (int) HadoopUtil.countRecords(
            new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS), PathType.LIST, null, getConf());
      }

      /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
       * new DistributedRowMatrix(...).rowSimilarity(...) */
      // calculate the co-occurrence matrix
      ToolRunner.run(getConf(), new RowSimilarityJob(), new String[]{
          "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
          "--output", similarityMatrixPath.toString(),
          "--numberOfColumns", String.valueOf(numberOfUsers),
          "--similarityClassname", similarityClassname,
          "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem),
          "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
          "--threshold", String.valueOf(threshold),
          "--tempDir", getTempPath().toString()});
    }

    // Phase 3: start the multiplication of the co-occurrence matrix by the user vectors
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job prePartialMultiply1 = prepareJob(
          similarityMatrixPath, prePartialMultiplyPath1, SequenceFileInputFormat.class,
          SimilarityMatrixRowWrapperMapper.class, VarIntWritable.class, VectorOrPrefWritable.class,
          Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
          SequenceFileOutputFormat.class);
      boolean succeeded = prePartialMultiply1.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }

      // continue the multiplication
      Job prePartialMultiply2 = prepareJob(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
          prePartialMultiplyPath2, SequenceFileInputFormat.class, UserVectorSplitterMapper.class,
          VarIntWritable.class, VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class,
          VectorOrPrefWritable.class, SequenceFileOutputFormat.class);
      if (usersFile != null) {
        prePartialMultiply2.getConfiguration().set(UserVectorSplitterMapper.USERS_FILE, usersFile);
      }
      prePartialMultiply2.getConfiguration().setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED,
          maxPrefsPerUser);
      succeeded = prePartialMultiply2.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
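      // Note: the two jobs above deliberately emit the same key/value types.
      // Both key by item index (VarIntWritable) and wrap their payload in a
      // VectorOrPrefWritable: SimilarityMatrixRowWrapperMapper wraps each item's
      // row of the similarity matrix, while UserVectorSplitterMapper splits each
      // user vector into per-item (userID, preference) pairs. The pass-through
      // job below can therefore read both outputs together, letting
      // ToVectorAndPrefReducer join each item's similarity vector with all the
      // preferences expressed for that item.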
      // finish the job
      Job partialMultiply = prepareJob(
          new Path(prePartialMultiplyPath1 + "," + prePartialMultiplyPath2), partialMultiplyPath,
          SequenceFileInputFormat.class, Mapper.class, VarIntWritable.class, VectorOrPrefWritable.class,
          ToVectorAndPrefReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
          SequenceFileOutputFormat.class);
      setS3SafeCombinedInputPath(partialMultiply, getTempPath(), prePartialMultiplyPath1, prePartialMultiplyPath2);
      succeeded = partialMultiply.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    // Phase 4: filter, then aggregate the partial products into recommendations.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      // filter out any users we don't care about
      /* convert the user/item pairs to filter if a filterfile has been specified */
      if (filterFile != null) {
        Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
            ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
            ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
            SequenceFileOutputFormat.class);
        boolean succeeded = itemFiltering.waitForCompletion(true);
        if (!succeeded) {
          return -1;
        }
      }

      String aggregateAndRecommendInput = partialMultiplyPath.toString();
      if (filterFile != null) {
        aggregateAndRecommendInput += "," + explicitFilterPath;
      }

      // extract out the recommendations
      Job aggregateAndRecommend = prepareJob(
          new Path(aggregateAndRecommendInput), outputPath, SequenceFileInputFormat.class,
          PartialMultiplyMapper.class, VarLongWritable.class, PrefAndSimilarityColumnWritable.class,
          AggregateAndRecommendReducer.class, VarLongWritable.class, RecommendedItemsWritable.class,
          TextOutputFormat.class);
      Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
      if (itemsFile != null) {
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
      }
      if (filterFile != null) {
        setS3SafeCombinedInputPath(aggregateAndRecommend, getTempPath(), partialMultiplyPath, explicitFilterPath);
      }
      setIOSort(aggregateAndRecommend);
      aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
          new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
      aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
      aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
      boolean succeeded = aggregateAndRecommend.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    return 0;
  }
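Stripped of the Hadoop plumbing, what phases 3 and 4 compute is a matrix-vector product: for each user, a candidate item's score is the similarity-weighted sum of that user's known preferences. (For non-boolean data, AggregateAndRecommendReducer additionally normalizes by the sum of the similarities; the sketch below omits that.) Here is a minimal in-memory analogue with toy data, using plain Java arrays instead of Mahout's Vector classes:

// Toy, in-memory analogue of the distributed multiply; this is NOT Mahout API.
public class CooccurrenceScoringSketch {
  public static void main(String[] args) {
    // similarity[i][j]: similarity between items i and j, as produced by
    // phase 2. The diagonal is zero, matching --excludeSelfSimilarity above.
    double[][] similarity = {
        {0.0, 0.8, 0.1},
        {0.8, 0.0, 0.4},
        {0.1, 0.4, 0.0}
    };
    // One user's preference vector; 0.0 means "no preference expressed".
    double[] prefs = {5.0, 0.0, 2.0};

    // score(i) = sum over rated items j of similarity[i][j] * prefs[j],
    // i.e. one row of the co-occurrence matrix times the user vector.
    int n = prefs.length;
    double[] scores = new double[n];
    for (int i = 0; i < n; i++) {
      for (int j = 0; j < n; j++) {
        if (prefs[j] > 0.0) {
          scores[i] += similarity[i][j] * prefs[j];
        }
      }
    }

    // Recommend the best-scoring items the user has not rated yet.
    for (int i = 0; i < n; i++) {
      if (prefs[i] == 0.0) {
        System.out.printf("item %d, predicted score %.2f%n", i, scores[i]);
      }
    }
  }
}

With this toy data the sketch predicts a score of 4.8 for item 1 (0.8 * 5.0 + 0.4 * 2.0), the only item this user has not rated; the MapReduce pipeline computes exactly this kind of sum, just partitioned by item and user across the cluster.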
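For completeness, here is roughly how the whole pipeline is launched. This is a sketch rather than code from the Mahout source: the HDFS paths are hypothetical placeholders, and SIMILARITY_COOCCURRENCE is just one of the similarity measures RowSimilarityJob accepts. The input is the usual one-preference-per-line userID,itemID,preference CSV.

// A minimal sketch of launching the full pipeline programmatically.
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;

public class RunRecommenderJob {
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new RecommenderJob(), new String[] {
        "--input", "/data/prefs.csv",          // hypothetical path: userID,itemID,preference lines
        "--output", "/data/recommendations",   // text output: userID followed by [itemID:score,...]
        "--similarityClassname", "SIMILARITY_COOCCURRENCE",
        "--numRecommendations", "10",
        "--tempDir", "/tmp/recommenderjob"     // hypothetical path for intermediate job output
    });
  }
}

The same job is normally started from the command line with hadoop jar mahout-core-<version>-job.jar org.apache.mahout.cf.taste.hadoop.item.RecommenderJob and the same flags.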