How can I implement the similarity of tf-idf and cosine in Lucene?

How can I implement tf-idf and cosine similarity in Lucene? I am using Lucene 4.2. The program I created does not (as far as I can tell) use tf-idf and cosine similarity; it only uses TopScoreDocCollector.

import com.mysql.jdbc.Statement; import java.io.BufferedReader; import java.io.File; import java.io.InputStreamReader; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.util.Version; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriter; import java.sql.DriverManager; import java.sql.Connection; import java.sql.ResultSet; import org.apache.lucene.analysis.id.IndonesianAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.*; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; public class IndexMysqlDBStemming { public static void main(String[] args) throws Exception { // 1. 
Create Index From Database Class.forName("com.mysql.jdbc.Driver").newInstance(); Connection connection = DriverManager.getConnection("jdbc:mysql://localhost/db_haiquran", "root", ""); IndonesianAnalyzer analyzer = new IndonesianAnalyzer(Version.LUCENE_42); //StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_42); QueryParser parser = new QueryParser(Version.LUCENE_42, "result", analyzer); Directory INDEX_DIR = new RAMDirectory(); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_42, analyzer); IndexWriter writer = new IndexWriter(INDEX_DIR, config); String query = "SELECT * FROM ayat"; java.sql.Statement statement = connection.createStatement(); ResultSet result = statement.executeQuery(query); while (result.next()) { Document document = new Document(); document.add(new Field("NO_INDEX_AYAT", result.getString("NO_INDEX_AYAT"), Field.Store.YES, Field.Index.NOT_ANALYZED)); document.add(new Field("NO_SURAT", result.getString("NO_SURAT"), Field.Store.YES, Field.Index.NOT_ANALYZED)); document.add(new Field("NO_AYAT", result.getString("NO_AYAT"), Field.Store.YES, Field.Index.NOT_ANALYZED)); document.add(new Field("TEXT_INDO", result.getString("TEXT_INDO"), Field.Store.YES, Field.Index.ANALYZED)); document.add(new Field("TEXT_ARAB", result.getString("TEXT_ARAB"), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.updateDocument(new Term("NO_INDEX_AYAT", result.getString("NO_INDEX_AYAT")), document); } writer.close(); // 2. Query System.out.println("Enter your search keyword in here : "); BufferedReader bufferRead = new BufferedReader(new InputStreamReader(System.in)); String s = bufferRead.readLine(); String querystr = args.length > 0 ? args[0] :s; try { System.out.println(parser.parse(querystr)+"\n"); //amenit System.out.println(); } catch (ParseException ex) { // Exception } Query q = new QueryParser(Version.LUCENE_42, "TEXT_INDO", analyzer).parse(querystr); // 3. 
Search int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(INDEX_DIR); IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); searcher.search(q, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // 4. Display results System.out.println("Found : " + hits.length + " hits."); System.out.println("No" + " ID " + "\t" + " Surat " + "\t" + " No Ayat " + "\t" + " Terjemahan Ayat " + "\t" + " Teks Arab "); for (int i=0; i<hits.length; i++) { int docID = hits[i].doc; Document d = searcher.doc(docID); System.out.println((i+1) + ". " + d.get("NO_INDEX_AYAT") + "\t" + d.get("NO_SURAT") + "\t" + d.get("NO_AYAT")+ "\t" + d.get("TEXT_INDO") + "\t" + d.get("TEXT_ARAB")); } reader.close(); } } 

How can I display the calculation results using tf-idf and cosine?

+4
source share
1 answer

Unless I am missing something, you have already done it. Well done!

By default, the DefaultSimilarity algorithm is used, but most of its documentation (and logic) is found in its base class, TFIDFSimilarity.

And TFIDFSimilarity is indeed an implementation of the TF-IDF and cosine similarity scoring model.

+4
source

All Articles