Lucene ist ein Full-Text-Search-Framework für Datenbanken.
Wieso sollte man damit einen Text tokenisieren?
<!-- Lucene core library, pinned to 3.5.0. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>3.5.0</version>
</dependency>
<!-- Lucene analyzers module, same version as lucene-core;
     presumably the source of GermanAnalyzer used in the Java snippet (verify). -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers</artifactId>
<version>3.5.0</version>
</dependency>
/**
 * Tokenizes {@code text} with a {@link GermanAnalyzer} and prints, for every
 * token, the analyzed term followed by the result of running a
 * {@link PorterStemmer} over that term (one value per line on stdout).
 *
 * @param text the text to analyze
 * @throws Exception if the analyzer or token stream fails while reading
 */
public static void lucene(String text) throws Exception {
    GermanAnalyzer analyzer = new GermanAnalyzer(Version.LUCENE_35);
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
        try {
            CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);

            // One stemmer is enough: setCurrent() replaces its working buffer
            // for every token, so there is no need to allocate inside the loop.
            // NOTE(review): PorterStemmer is fed tokens produced by a German
            // analyzer here - confirm this combination is intended.
            PorterStemmer stemmer = new PorterStemmer();

            // The TokenStream consumer contract requires reset() before the
            // first incrementToken() call.
            stream.reset();
            while (stream.incrementToken()) {
                String term = cattr.toString();
                System.out.println(term);

                stemmer.setCurrent(term);
                stemmer.stem();
                System.out.println(stemmer.getCurrent());
            }
            stream.end();
        } finally {
            // Release the stream even if tokenizing or stemming threw.
            stream.close();
        }
    } finally {
        analyzer.close();
    }
}