/**
 * A named statistical model of a language, holding the trigram, bigram and
 * word tables gathered from a sample text together with the size of the
 * sample's alphabet.
 * <p>
 * The tables are not created by the constructor; they stay {@code null}
 * until the corresponding setter has been called.
 */
public class LanguageModel implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Number of decimal places kept when computing inverse frequencies. */
    private static final int INV_FREQUENCY_SCALE = 5;

    /**
     * Rounding applied to the running total in
     * {@link #calculateCumFrequency(Hashtable)}: five significant digits,
     * half-up.
     */
    private static final MathContext CUM_FREQUENCY_CONTEXT = new MathContext(5,
            RoundingMode.HALF_UP);

    /** The name of the LanguageModel, e.g. German, English etc. */
    private String name;

    /**
     * A hash table to store trigrams. The key is the trigram itself; it is
     * equal to the value returned by the {@code getIdentifier()} method of
     * the TrigramObject.
     */
    private Hashtable<String, TrigramObject> trigramList;

    /**
     * A hash table to store bigrams, only used to compute probabilities of
     * trigram occurrences.
     */
    private Hashtable<String, ModelComponent> bigramList;

    /**
     * A hash table to store words. The key is the word itself; it is equal to
     * the value returned by the {@code getIdentifier()} method of the
     * WordObject.
     */
    private Hashtable<String, WordObject> wordList;

    /**
     * The number of different characters in the sample text, only used to
     * compute probabilities of trigram occurrences.
     */
    private int alphabetSize;

    /**
     * Constructs a new {@code LanguageModel} instance with the specified
     * name. The trigram, bigram and word tables are left unset.
     *
     * @param name
     *            LanguageModel name string
     */
    public LanguageModel(String name) {
        this.name = name;
    }

    /**
     * Sets the trigram list of the language model.
     *
     * @param trList
     *            a hash table with TrigramObjects
     */
    public void setTrigramList(Hashtable<String, TrigramObject> trList) {
        trigramList = trList;
    }

    /**
     * Sets the bigram list of the language model.
     *
     * @param bList
     *            a hash table with bigrams, stored as ModelComponent
     *            instances
     */
    public void setBigramList(Hashtable<String, ModelComponent> bList) {
        bigramList = bList;
    }

    /**
     * Returns the bigram list of the LanguageModel.
     *
     * @return the hash table with bigrams of this LanguageModel, or
     *         {@code null} if none has been set
     */
    public Hashtable<String, ModelComponent> getBigramList() {
        return bigramList;
    }

    /**
     * Sets the size of the alphabet.
     *
     * @param size
     *            size of the alphabet
     */
    public void setAlphabetSize(int size) {
        alphabetSize = size;
    }

    /**
     * Returns the alphabet size.
     *
     * @return the size of the alphabet
     */
    public int getAlphabetSize() {
        return alphabetSize;
    }

    /**
     * Sets the word list of the LanguageModel.
     *
     * @param wList
     *            a hash table with stored WordObjects
     */
    public void setWordList(Hashtable<String, WordObject> wList) {
        wordList = wList;
    }

    /**
     * Returns the hash table with TrigramObjects.
     *
     * @return the trigram list of the LanguageModel, or {@code null} if none
     *         has been set
     */
    public Hashtable<String, TrigramObject> getTrigramList() {
        return trigramList;
    }

    /**
     * Returns the hash table with WordObjects.
     *
     * @return the word list of the LanguageModel, or {@code null} if none has
     *         been set
     */
    public Hashtable<String, WordObject> getWordList() {
        return wordList;
    }

    /**
     * Sets the name of the LanguageModel.
     *
     * @param name
     *            name string
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * Returns the name of the LanguageModel.
     *
     * @return the LanguageModel name
     */
    public String getName() {
        return name;
    }

    /**
     * Sorts instances of LanguageModel stored in a Vector alphabetically by
     * their names, ignoring case. The sort is performed in place and is
     * stable, i.e. models whose names compare equal keep their relative
     * order.
     *
     * @param lm
     *            a vector with LanguageModel objects
     */
    public static void sortVector(Vector<LanguageModel> lm) {
        Collections.sort(lm, new Comparator<LanguageModel>() {
            @Override
            public int compare(LanguageModel first, LanguageModel second) {
                return first.getName().compareToIgnoreCase(second.getName());
            }
        });
    }

    /**
     * Computes the cumulative frequency for each word in the word list.
     * <p>
     * The words are ordered with {@code ModelComponentComparator} (according
     * to the original author's note: descending by number of occurrences).
     * Walking that order, each word's relative frequency is added to a
     * running total, which is rounded to five significant digits (half-up)
     * after every addition and then stored on the word via
     * {@code setCumFrequency}.
     *
     * @param hash
     *            the word list of the LanguageModel
     */
    public static void calculateCumFrequency(Hashtable<String, WordObject> hash) {
        // copy into a list so the predefined sorting routine can be used
        ArrayList<WordObject> sortedWords = new ArrayList<WordObject>(hash
                .values());
        Collections.sort(sortedWords,
                (Comparator<? super WordObject>) new ModelComponentComparator());

        BigDecimal runningTotal = BigDecimal.ZERO;
        for (WordObject word : sortedWords) {
            // increase the cumulative frequency by the relative frequency of
            // the current word
            runningTotal = runningTotal.add(
                    new BigDecimal(word.getRelFrequency()),
                    CUM_FREQUENCY_CONTEXT);
            word.setCumFrequency(runningTotal.doubleValue());
        }
    }

    /**
     * Returns a string representation of a collection of WordObject or
     * TrigramObject instances: the {@code toString()} value of every element
     * in iteration order, each followed by a newline character.
     *
     * @param c
     *            a Vector or an ArrayList with WordObject or TrigramObject
     *            instances
     * @return string representation of the collection; the empty string if
     *         the collection is empty
     */
    public String displayCollection(Collection<? extends ModelComponent> c) {
        StringBuilder text = new StringBuilder();
        for (ModelComponent component : c) {
            text.append(component.toString()).append("\n");
        }
        return text.toString();
    }

    /**
     * Computes the inverse frequency values for all language models, either
     * for trigrams or for words, depending on the parameter {@code i}.
     * <p>
     * For every entry of every model, the entry's frequency is divided by the
     * number of models that contain an entry with the same identifier (the
     * owning model included). The quotient is rounded half-up to five
     * decimal places and stored via {@code setInvFrequency}. Models whose
     * relevant list has not been set are treated as having no entries.
     *
     * @param languageModels
     *            a vector storing LanguageModels
     * @param i
     *            if set to 1 - compute the inverse frequency for trigrams,
     *            otherwise for words
     */
    public static void calculateInvFrequency(
            Vector<LanguageModel> languageModels, int i) {
        if (i == 1) {
            calculateTrigramInvFrequency(languageModels);
        } else {
            calculateWordInvFrequency(languageModels);
        }
    }

    /** Computes and stores the inverse frequency of every trigram of every model. */
    private static void calculateTrigramInvFrequency(
            Vector<LanguageModel> languageModels) {
        for (int current = 0; current < languageModels.size(); current++) {
            Hashtable<String, TrigramObject> trigrams = languageModels.get(
                    current).getTrigramList();
            if (trigrams == null) {
                continue;
            }
            for (TrigramObject trigram : trigrams.values()) {
                // the owning model always counts
                int counter = 1;
                for (int other = 0; other < languageModels.size(); other++) {
                    if (other == current) {
                        continue;
                    }
                    // each time the trigram is found in a language model
                    // other than the current one, increase the counter
                    Hashtable<String, TrigramObject> otherTrigrams = languageModels
                            .get(other).getTrigramList();
                    if (otherTrigrams != null
                            && otherTrigrams.containsKey(trigram
                                    .getIdentifier())) {
                        counter++;
                    }
                }
                trigram.setInvFrequency(divideByModelCount(new BigDecimal(
                        trigram.getFrequency()), counter));
            }
        }
    }

    /** Computes and stores the inverse frequency of every word of every model. */
    private static void calculateWordInvFrequency(
            Vector<LanguageModel> languageModels) {
        for (int current = 0; current < languageModels.size(); current++) {
            Hashtable<String, WordObject> words = languageModels.get(current)
                    .getWordList();
            if (words == null) {
                continue;
            }
            for (WordObject word : words.values()) {
                // the owning model always counts
                int counter = 1;
                for (int other = 0; other < languageModels.size(); other++) {
                    if (other == current) {
                        continue;
                    }
                    // each time the word is found in a language model other
                    // than the current one, increase the counter
                    Hashtable<String, WordObject> otherWords = languageModels
                            .get(other).getWordList();
                    if (otherWords != null
                            && otherWords.containsKey(word.getIdentifier())) {
                        counter++;
                    }
                }
                word.setInvFrequency(divideByModelCount(new BigDecimal(word
                        .getFrequency()), counter));
            }
        }
    }

    /**
     * Divides a frequency by the number of models containing the entry,
     * rounding half-up to {@link #INV_FREQUENCY_SCALE} decimal places.
     */
    private static double divideByModelCount(BigDecimal frequency, int counter) {
        return frequency.divide(new BigDecimal(counter), INV_FREQUENCY_SCALE,
                RoundingMode.HALF_UP).doubleValue();
    }
}