We created a Java version of the Python Hindi stemmer source code.
The authors of the original Hindi stemmer use a variable named L whose role we do not entirely understand; nevertheless, here is complete code that should work:
import java.util.ArrayList; import org.apache.commons.lang.StringUtils;
/**
 * Lightweight suffix-stripping stemmer for Hindi words written in Devanagari.
 *
 * <p>Suffixes are grouped into tiers by their length in characters (5 down to 1). A word is
 * checked against the longest suffixes first; the first suffix that matches is removed and the
 * remainder is returned. A tier of length {@code L} is only tried when the word has more than
 * {@code L + 1} characters, so at least two characters of the word always remain after stripping.
 *
 * <p>Lengths are counted in Java {@code char} units. Devanagari lies in the Basic Multilingual
 * Plane, so one {@code char} corresponds to one Devanagari code point.
 */
public class SimpleHindiStemmer {

    /**
     * Suffix tiers ordered from the longest suffixes (5 characters) to the shortest (1 character).
     * The tier at index {@code i} holds suffixes of length {@code SUFFIX_TIERS.length - i}.
     */
    private static final String[][] SUFFIX_TIERS = {
        { "ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां" },
        { "ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं",
          "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां" },
        { "ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती",
          "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं" },
        { "कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां",
          "ों", "ें" },
        { "ो", "े", "ू", "ु", "ी", "ि", "ा" }
    };

    /**
     * Legacy per-tier strip amounts.
     *
     * @deprecated no longer read by {@link #stemprocess(String)}, which now removes exactly the
     *     matched suffix; retained only so existing references to this field keep compiling.
     */
    @Deprecated
    static int[] cut = new int[] { 0, 1, 1, 1, 2, 2 };

    /** Creates a stemmer. The stemmer holds no per-instance state. */
    public SimpleHindiStemmer() {
    }

    /**
     * Small demonstration: prints the stem of a sample word to standard output.
     *
     * @param argv ignored
     */
    public static void main(String[] argv) {
        SimpleHindiStemmer sm = new SimpleHindiStemmer();
        String word = "रास्ते";
        System.out.println(sm.stemprocess(word));
    }

    /**
     * Strips the longest matching suffix from {@code word}.
     *
     * @param word the word to stem; may be {@code null}
     * @return {@code word} without its matched suffix; {@code word} itself when no suffix applies
     *     (including when the word is too short); {@code null} when {@code word} is {@code null}
     */
    public String stemprocess(String word) {
        if (word == null) {
            return null;
        }
        final int wordLength = word.length();
        int tierLength = SUFFIX_TIERS.length;
        for (String[] suffixes : SUFFIX_TIERS) {
            // Only strip when at least two characters of the word would be left over.
            if (wordLength > tierLength + 1) {
                for (String suffix : suffixes) {
                    if (word.endsWith(suffix)) {
                        // Remove exactly the matched suffix, not a fixed per-tier amount.
                        return word.substring(0, wordLength - suffix.length());
                    }
                }
            }
            tierLength--;
        }
        return word;
    }
}
As you can see, the UTF-8 characters may not be displayed correctly in some cases. If that happens, take a look at the original Python code and simply copy the suffix values from there.