This is not entirely true. Mahout has no content-based recommender as such, but it does have algorithms for computing similarities between items based on their content. One of the most popular combinations is TF-IDF with cosine similarity. The computation is not performed on the fly, however; it is done offline, and you need Hadoop to compute the pairwise similarities reasonably fast. The steps I am going to write are for MAHOUT 0.8; I am not sure whether they changed in 0.9.
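For orientation, the cosine similarity used in step 4 below is just the normalized dot product of two TF-IDF vectors. A minimal sketch with Mahout's Vector API (class name and toy values are made up):

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;

    public class CosineSketch {
        static double cosine(Vector a, Vector b) {
            // dot(a, b) / (|a| * |b|), where |.| is the L2 norm
            return a.dot(b) / (a.norm(2) * b.norm(2));
        }
        public static void main(String[] args) {
            Vector a = new DenseVector(new double[] {0.2, 0.0, 0.7});
            Vector b = new DenseVector(new double[] {0.1, 0.3, 0.5});
            System.out.println(cosine(a, b)); // ~0.86 for these made-up weights
        }
    }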
1. Convert your documents into SequenceFiles. This command is from MAHOUT 0.8; I have not verified whether it is unchanged in 0.9 (check your version of MAHOUT):
$MAHOUT_HOME/bin/mahout seqdirectory
--input <PARENT DIR WHERE DOCS ARE LOCATED> --output <OUTPUT DIRECTORY>
<-c <CHARSET NAME OF THE INPUT DOCUMENTS> {UTF-8|cp1252|ascii...}>
<-chunk <MAX SIZE OF EACH CHUNK in Megabytes> 64>
<-prefix <PREFIX TO ADD TO THE DOCUMENT ID>>
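If you want to sanity-check this step: the output is a Hadoop SequenceFile with Text keys (document IDs) and Text values (document content). A minimal sketch that dumps it, assuming a hypothetical chunk file path:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;

    public class DumpSeqDocs {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(conf);
            // "docs-seq/chunk-0" is a hypothetical path to one seqdirectory output chunk.
            SequenceFile.Reader reader =
                    new SequenceFile.Reader(fs, new Path("docs-seq/chunk-0"), conf);
            Text key = new Text();   // document ID (file name, plus any -prefix)
            Text value = new Text(); // raw document text
            while (reader.next(key, value)) {
                System.out.println(key + " -> " + value.toString().length() + " chars");
            }
            reader.close();
        }
    }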
2. Create sparse TF-IDF vectors from the SequenceFiles:
$MAHOUT_HOME/bin/mahout seq2sparse \
-i <SEQ INPUT DIR> \
-o <VECTORS OUTPUT DIR> \
-ow -chunk 100 \
-wt tfidf \
-x 90 \
-seq \
-ml 50 \
-md 3 \
-n 2 \
-nv \
-Dmapred.map.tasks=1000 -Dmapred.reduce.tasks=1000
where:
- chunk: the chunk size in megabytes.
- x: the maximum percentage of documents a term may appear in and still enter the dictionary. A term that appears in more than x% of the documents is dropped as too common.
- wt: the weighting scheme (tfidf here).
- md: the minimum number of documents a term must appear in to enter the dictionary. A term that appears in fewer documents is dropped.
- n: the normalization, i.e. the Lp norm to apply. The value 2 selects the L2 (Euclidean) norm, which is what cosine similarity needs.
- nv: emit named vectors, which makes the output of the later steps easier to inspect and debug (see the sketch after this list).
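To see what -nv buys you, here is a sketch (same reading pattern as the step-1 one, hypothetical path) that prints each document's name and the number of terms in its TF-IDF vector:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;
    import org.apache.mahout.math.NamedVector;
    import org.apache.mahout.math.VectorWritable;

    public class DumpTfidfVectors {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(conf);
            // Hypothetical path: a part file under <VECTORS OUTPUT DIR>/tfidf-vectors.
            SequenceFile.Reader reader = new SequenceFile.Reader(
                    fs, new Path("vectors/tfidf-vectors/part-r-00000"), conf);
            Text key = new Text();
            VectorWritable value = new VectorWritable();
            while (reader.next(key, value)) {
                // With -nv the payload is a NamedVector that carries the document name.
                String name = value.get() instanceof NamedVector
                        ? ((NamedVector) value.get()).getName()
                        : key.toString();
                System.out.println(name + ": " + value.get().getNumNondefaultElements() + " terms");
            }
            reader.close();
        }
    }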
3. Create a matrix from the vectors:
$MAHOUT_HOME/bin/mahout rowid -i <VECTORS OUTPUT DIR>/tfidf-vectors/part-r-00000 -o <MATRIX OUTPUT DIR>
4. Compute the pairwise similarity matrix:
$MAHOUT_HOME/bin/mahout rowsimilarity -i <MATRIX OUTPUT DIR>/matrix -o <SIMILARITY OUTPUT DIR> -r <NUM OF COLUMNS FROM THE OUTPUT IN STEP 3> --similarityClassname SIMILARITY_COSINE -m 50 -ess -Dmapred.map.tasks=1000 -Dmapred.reduce.tasks=1000
Here -m 50 means that at most the 50 most similar documents are kept for each document.
Now, if you want to use these similarities in memory, for example in a non-distributed item-based recommender, you need a Collection<GenericItemSimilarity.ItemItemSimilarity>. Here is a naive way to build one by reading the output files of step 4:
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

import gnu.trove.map.hash.TIntLongHashMap; // Trove 3

// Loads the rowsimilarity output (step 4) into memory. 'logger' is assumed to be
// declared in the enclosing utility class.
public static Collection<GenericItemSimilarity.ItemItemSimilarity> correlationMatrix(
        final File folder, TIntLongHashMap docIndex) throws IOException {
    Collection<GenericItemSimilarity.ItemItemSimilarity> corrMatrix =
            new ArrayList<GenericItemSimilarity.ItemItemSimilarity>();
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    File[] files = folder.listFiles();
    int n = 0;
    for (final File fileEntry : files) {
        if (fileEntry.isFile() && fileEntry.getName().startsWith("part-r")) {
            SequenceFile.Reader reader =
                    new SequenceFile.Reader(fs, new Path(fileEntry.getAbsolutePath()), conf);
            IntWritable key = new IntWritable();
            VectorWritable value = new VectorWritable();
            while (reader.next(key, value)) {
                // Map the row index back to the original document ID.
                long itemID1 = docIndex.get(key.get());
                for (Vector.Element next : value.get().nonZeroes()) {
                    long itemID2 = docIndex.get(next.index());
                    double similarity = next.get();
                    // Clamp to [-1, 1]: rounding can push cosine values slightly out of range.
                    if (similarity < -1.0) {
                        similarity = -1.0;
                    } else if (similarity > 1.0) {
                        similarity = 1.0;
                    }
                    corrMatrix.add(new GenericItemSimilarity.ItemItemSimilarity(
                            itemID1, itemID2, similarity));
                }
            }
            reader.close();
            n++;
            logger.info("File " + fileEntry.getName() + " read (" + n + "/" + files.length + ")");
        }
    }
    return corrMatrix;
}

// Loads the docIndex produced by rowid (step 3): row index -> document ID.
public static TIntLongHashMap getDocIndex(String docIndex) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    TIntLongHashMap map = new TIntLongHashMap();
    SequenceFile.Reader docIndexReader = new SequenceFile.Reader(fs, new Path(docIndex), conf);
    IntWritable key = new IntWritable();
    Text value = new Text();
    while (docIndexReader.next(key, value)) {
        // Assumes the original document names are numeric IDs.
        map.put(key.get(), Long.parseLong(value.toString()));
    }
    docIndexReader.close();
    return map;
}
And then:
TIntLongHashMap docIndex = ItemPairwiseSimilarityUtil.getDocIndex(filename);
Collection<GenericItemSimilarity.ItemItemSimilarity> corrMatrix =
        ItemPairwiseSimilarityUtil.correlationMatrix(folder, docIndex);
Here filename is the path to the docIndex file produced in step 3, and folder is the directory containing the part-r-* files from step 4. Keep in mind that this loads all the pairwise similarities into memory, so they have to fit.
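Once the collection is in memory you can plug it into a Taste item-based recommender. A sketch of the wiring, assuming hypothetical output paths, a hypothetical ratings.csv in FileDataModel format, and a made-up user ID:

    import java.io.File;
    import java.util.Collection;
    import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
    import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
    import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
    import org.apache.mahout.cf.taste.model.DataModel;
    import org.apache.mahout.cf.taste.recommender.RecommendedItem;
    import gnu.trove.map.hash.TIntLongHashMap;

    public class ContentBasedRecommenderSketch {
        public static void main(String[] args) throws Exception {
            TIntLongHashMap docIndex =
                    ItemPairwiseSimilarityUtil.getDocIndex("matrix-out/docIndex");
            Collection<GenericItemSimilarity.ItemItemSimilarity> corrMatrix =
                    ItemPairwiseSimilarityUtil.correlationMatrix(new File("similarity-out"), docIndex);
            // Preference data is still needed for recommendations;
            // only the similarity side is content-based.
            DataModel model = new FileDataModel(new File("ratings.csv"));
            GenericItemBasedRecommender recommender =
                    new GenericItemBasedRecommender(model, new GenericItemSimilarity(corrMatrix));
            for (RecommendedItem item : recommender.recommend(42L, 10)) { // user 42, top 10
                System.out.println(item.getItemID() + " @ " + item.getValue());
            }
        }
    }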
Hope it helps.