Is there any stemmer for Indian languages?

Is there any stemmer implementation for Indian languages such as Hindi or Telugu?

+7
nlp stemming indic
source share
4 answers

A Hindi analyzer, which includes a stemmer, is available in Lucene. It is based on this algorithm (pdf).

+4
source share

hindi_stemmer is a Python implementation of the Hindi stemmer described in "A Lightweight Stemmer for Hindi" by Ananthakrishnan Ramanathan and Durgesh D. Rao.

+3
source share

We created a Java version of the Python Hindi stemmer source code.

We do not fully understand how the authors of the original hindi_stemmer use the variable L, but here is complete code that works:

import java.util.ArrayList; import org.apache.commons.lang.StringUtils;

/**
 * Suffix-stripping stemmer for Hindi, ported from the Python {@code hindi_stemmer}.
 *
 * <p>The suffix tables below correspond to the {@code suffixes} table of the Python
 * code. If any glyph looks garbled after copy/paste, re-copy the values from the
 * Python source.
 *
 * <p>The longest matching suffix wins, and the whole suffix is removed. A suffix is
 * only considered when the word is at least two characters longer than the suffix,
 * so very short words are returned unchanged.
 */
public class SimpleHindiStemmer {

    /**
     * Suffix groups ordered from the longest (5 characters) to the shortest
     * (1 character), so that the first match is also the longest one.
     */
    private static final String[][] SUFFIXES_LONGEST_FIRST = {
        { "ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां" },
        { "ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी",
          "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां" },
        { "ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना",
          "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं" },
        { "कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता",
          "ाँ", "ां", "ों", "ें" },
        { "ो", "े", "ू", "ु", "ी", "ि", "ा" }
    };

    /** Creates a stemmer; the class holds no per-instance state. */
    public SimpleHindiStemmer() {
    }

    /**
     * Small demo: prints the stem of one sample word to standard output.
     *
     * @param argv ignored
     */
    public static void main(String[] argv) {
        SimpleHindiStemmer sm = new SimpleHindiStemmer();
        String word = "रास्ते";
        System.out.println(sm.stemprocess(word));
    }

    /**
     * Returns {@code word} with its longest known suffix removed.
     *
     * @param word the word to stem; {@code null} and the empty string are returned as-is
     * @return the stem, or {@code word} itself when no suffix applies
     */
    public String stemprocess(String word) {
        if (word == null || word.length() == 0) {
            return word;
        }
        // Every suffix character is a single Java char, so String.length() is the
        // character count the length guard needs (no byte-length estimate required).
        int wordLength = word.length();
        for (String[] group : SUFFIXES_LONGEST_FIRST) {
            for (String suffix : group) {
                int suffixLength = suffix.length();
                // Keep at least two characters of stem, as the Python original does.
                if (wordLength > suffixLength + 1 && word.endsWith(suffix)) {
                    return word.substring(0, wordLength - suffixLength);
                }
            }
        }
        return word;
    }
}

As you can see, some UTF-8 characters do not survive pasting. Take a look at the original Python code and copy the suffix values from there.

0
source share
 import java.util.Map; import java.util.WeakHashMap; 

/**
 * Hindi light stemmer: removes number, gender and case suffixes from nouns and
 * adjectives.
 *
 * <p>At most one suffix is removed per word. Three-character suffixes are tried
 * first, then two-character ones, then single characters; each tier is only tried
 * when the word is long enough (see {@link #remove_suffix(StringBuilder)}).
 */
public class HindiStemmerLight {

    /**
     * A cache of words and their stems, shared by all instances. Access is not
     * synchronized by this class.
     */
    static private Map<String, String> cache = new WeakHashMap<String, String>();

    /** Three-character suffixes; tried on words of six or more characters. */
    private static final String[] SUFFIXES_3 = { "िया", "ियो" };

    /** Two-character suffixes; tried on words of five or more characters. */
    private static final String[] SUFFIXES_2 = {
        "ाए", "ाओ", "ुआ", "ुओ", "ये", "ेन", "ेण", "ीय", "टी", "ार", "ाई"
    };

    /** Single-character suffixes; tried on words of four or more characters. */
    private static final String[] SUFFIXES_1 = { "ा", "े", "ी", "ो", "ि", "अ" };

    /**
     * Default constructor
     */
    public HindiStemmerLight() {
    }

    /**
     * Returns the stem of {@code word}, consulting and updating the shared cache.
     *
     * @param word the word to stem; {@code null} and the empty string are returned as-is
     * @return the word with at most one suffix removed
     */
    public String stem(String word) {
        if (word == null || word.length() == 0) {
            return word;
        }
        String result = cache.get(word);
        if (result != null) {
            return result;
        }

        // Work on a fresh copy of the word; stemming an empty buffer would yield ""
        // for every input.
        StringBuilder sb = new StringBuilder(word);

        /* remove the case endings from nouns and adjectives */
        remove_suffix(sb);

        result = sb.toString();
        cache.put(word, result);
        return result;
    }

    /**
     * Removes the first matching suffix from {@code word} in place, trying the
     * longest suffixes first.
     */
    private void remove_suffix(StringBuilder word) {
        int last = word.length() - 1;
        if (last > 4 && stripAny(word, SUFFIXES_3)) {
            return;
        }
        if (last > 3 && stripAny(word, SUFFIXES_2)) {
            return;
        }
        if (last > 2) {
            stripAny(word, SUFFIXES_1);
        }
    }

    /**
     * Truncates {@code word} by the first entry of {@code suffixes} it ends with.
     *
     * @return {@code true} if a suffix was removed
     */
    private static boolean stripAny(StringBuilder word, String[] suffixes) {
        int length = word.length();
        for (String suffix : suffixes) {
            int start = length - suffix.length();
            if (start >= 0 && word.substring(start).equals(suffix)) {
                word.setLength(start);
                return true;
            }
        }
        return false;
    }
}

0
source share

All Articles