I came up with the method below, whose purpose is to split text of arbitrary length into an array of words for further full-text index processing (stop-word removal, followed by stemming). The results seem to be correct, but I would like to hear opinions on how reliable this implementation is for texts in different languages. Would a regex be a better choice? Please note that I deliberately avoided String.Split() because it would require me to pass a list of all known delimiters, which is exactly what I was trying to avoid when I wrote this function.
PS: I can't use a full-blown full-text search engine like Lucene.Net for several reasons (Silverlight, overkill for the project's scope, etc.).
/// <summary>
/// Splits <paramref name="Text"/> into words for full-text indexing.
/// </summary>
/// <remarks>
/// A word boundary is any character for which <see cref="Char.IsSeparator(char)"/>
/// or <see cref="Char.IsControl(char)"/> returns true (spaces, line/paragraph
/// separators, tabs, newlines, ...). Punctuation and symbol characters are not
/// boundaries: they are dropped from the word they occur in, so "foo-bar"
/// yields "foobar" and "don't" yields "dont". Tokens that consist solely of
/// punctuation or symbols produce no word.
/// </remarks>
/// <param name="Text">The text to split. May be null or empty.</param>
/// <returns>
/// The words in order of appearance; an empty array when <paramref name="Text"/>
/// is null, empty, or contains no word characters.
/// </returns>
public string[] SplitWords(string Text)
{
    var result = new List<string>();

    // Guard: the original indexed Text[0] unconditionally and crashed on
    // null / empty input. Such input simply contains no words.
    if (string.IsNullOrEmpty(Text))
    {
        return result.ToArray();
    }

    // The buffer doubles as the "inside a word" state: a non-empty buffer
    // means a word is being accumulated.
    var sbWord = new StringBuilder();

    for (int i = 0; i < Text.Length; i++)
    {
        char c = Text[i];

        if (Char.IsSeparator(c) || Char.IsControl(c))
        {
            // Word boundary: emit whatever has been collected so far.
            if (sbWord.Length > 0)
            {
                result.Add(sbWord.ToString());
                sbWord.Clear();
            }
        }
        else if (!Char.IsPunctuation(c) && !Char.IsSymbol(c))
        {
            // Punctuation and symbols are skipped without ending the word.
            sbWord.Append(c);
        }
    }

    // Flush the trailing word. Without this, text that does not end in a
    // separator or control character silently loses its last word.
    if (sbWord.Length > 0)
    {
        result.Add(sbWord.ToString());
    }

    return result.ToArray();
}
source
share