/**
 * Light stemmer for Czech.
 *
 * <p>Removes case endings from nouns and adjectives, removes possessive
 * adjective endings from names, and takes care of palatalisation.
 *
 * <p>Suffixes containing Czech diacritics are written as Unicode escapes in
 * the string literals; comments name them in ASCII (for example "c-caron" is
 * U+010D, "e-caron" is U+011B, "u-ring" is U+016F).
 *
 * <p>Instances keep no state between calls: every call to
 * {@link #stem(String)} works on its own buffer.
 *
 * @author Dolamic Ljiljana, University of Neuchatel
 */
public class CzechStemmerLight {

    /**
     * Default constructor.
     */
    public CzechStemmerLight() {
    }

    /**
     * Stems a single word.
     *
     * <p>The word is lower-cased (locale-independently), its case ending is
     * removed, and then a possessive ending is removed.
     *
     * @param input the word to stem; may be {@code null}
     * @return the stem of {@code input}, or {@code null} if {@code input}
     *         is {@code null}
     */
    public String stem(String input) {
        if (input == null) {
            // Previously null was silently stemmed as the literal text "null".
            return null;
        }
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. the Turkish dotless-i mapping).
        StringBuilder word = new StringBuilder(input.toLowerCase(java.util.Locale.ROOT));

        // Case endings of nouns and adjectives go first ...
        removeCase(word);
        // ... then possessive endings of names (-ov, -(u-ring)v, -in).
        removePossessives(word);

        return word.toString();
    }

    /**
     * Undoes palatalisation at the end of the buffer, or drops the last
     * character when no palatalised ending is present.
     *
     * <p>Callers leave the first vowel of the removed suffix in place so that
     * this method can inspect it together with the preceding consonant(s).
     *
     * @param buffer the word being stemmed; modified in place
     */
    private void palatalise(StringBuilder buffer) {
        int len = buffer.length();
        if (len == 0) {
            return; // nothing to inspect or delete
        }

        // -ci, -ce, -(c-caron)i, -(c-caron)e  ->  -k
        if (endsWithAny(buffer, "ci", "ce", "\u010di", "\u010de")) {
            buffer.replace(len - 2, len, "k");
            return;
        }
        // -zi, -ze, -(z-caron)i, -(z-caron)e  ->  -h
        if (endsWithAny(buffer, "zi", "ze", "\u017ei", "\u017ee")) {
            buffer.replace(len - 2, len, "h");
            return;
        }
        // -(c-caron)t(e-caron), -(c-caron)ti, -(c-caron)t(i-acute)  ->  -ck
        if (endsWithAny(buffer, "\u010dt\u011b", "\u010dti", "\u010dt\u00ed")) {
            buffer.replace(len - 3, len, "ck");
            return;
        }
        // -(s-caron)t(e-caron), -(s-caron)ti, -(s-caron)t(i-acute)  ->  -sk
        // These are three-character suffixes, so the last THREE characters
        // are compared and replaced (the earlier code looked at two and
        // therefore never matched).
        if (endsWithAny(buffer, "\u0161t\u011b", "\u0161ti", "\u0161t\u00ed")) {
            buffer.replace(len - 3, len, "sk");
            return;
        }

        // No palatalised ending: just drop the trailing character.
        buffer.deleteCharAt(len - 1);
    }

    /**
     * Removes the possessive endings -ov, -(u-ring)v and -in from words
     * longer than five characters.
     *
     * @param buffer the word being stemmed; modified in place
     */
    private void removePossessives(StringBuilder buffer) {
        if (buffer.length() <= 5) {
            return;
        }
        // -ov, -(u-ring)v: strip the whole ending.
        if (endsWithAny(buffer, "ov", "\u016fv")) {
            dropLast(buffer, 2);
            return;
        }
        // -in: strip the "n" and let palatalise() deal with the "i".
        if (endsWith(buffer, "in")) {
            dropLast(buffer, 1);
            palatalise(buffer);
        }
    }

    /**
     * Removes a case ending from a noun or adjective.
     *
     * <p>Longer endings are tried first and each requires a minimum word
     * length; at most one ending is removed. Where the ending starts with a
     * vowel that may have caused palatalisation, that vowel is left in the
     * buffer and {@link #palatalise(StringBuilder)} is called.
     *
     * @param buffer the word being stemmed; modified in place
     */
    private void removeCase(StringBuilder buffer) {
        int len = buffer.length();

        if (len > 7 && endsWith(buffer, "atech")) {
            dropLast(buffer, 5);
            return;
        }

        if (len > 6) {
            // -(e-caron)tem: keep the leading vowel for palatalise().
            if (endsWith(buffer, "\u011btem")) {
                dropLast(buffer, 3);
                palatalise(buffer);
                return;
            }
            // -at(u-ring)m
            if (endsWith(buffer, "at\u016fm")) {
                dropLast(buffer, 4);
                return;
            }
        }

        if (len > 5) {
            // -ech, -ich, -(i-acute)ch, -(e-acute)ho, -(e-caron)mi, -emi,
            // -(e-acute)mu, -(e-caron)te, -(e-caron)ti, -iho, -(i-acute)ho,
            // -(i-acute)mi, -imu
            if (endsWithAny(buffer,
                    "ech", "ich", "\u00edch",
                    "\u00e9ho", "\u011bmi", "emi", "\u00e9mu", "\u011bte",
                    "\u011bti", "iho", "\u00edho", "\u00edmi", "imu")) {
                dropLast(buffer, 2);
                palatalise(buffer);
                return;
            }
            // -(a-acute)ch, -ata, -aty, -(y-acute)ch, -ama, -ami,
            // -ov(e-acute), -ovi, -(y-acute)mi
            if (endsWithAny(buffer,
                    "\u00e1ch", "ata", "aty", "\u00fdch", "ama", "ami",
                    "ov\u00e9", "ovi", "\u00fdmi")) {
                dropLast(buffer, 3);
                return;
            }
        }

        if (len > 4) {
            if (endsWith(buffer, "em")) {
                dropLast(buffer, 1);
                palatalise(buffer);
                return;
            }
            // -es, -(e-acute)m, -(i-acute)m
            // NOTE(review): unlike the branches above, the whole ending is
            // removed before palatalise() runs, so palatalise() then acts on
            // the stem itself. Behaviour kept as found; confirm it is intended.
            if (endsWithAny(buffer, "es", "\u00e9m", "\u00edm")) {
                dropLast(buffer, 2);
                palatalise(buffer);
                return;
            }
            // -(u-ring)m, -at, -(a-acute)m, -os, -us, -(y-acute)m, -mi, -ou
            if (endsWithAny(buffer,
                    "\u016fm", "at", "\u00e1m", "os", "us", "\u00fdm", "mi", "ou")) {
                dropLast(buffer, 2);
                return;
            }
        }

        if (len > 3) {
            // -e, -i, -(i-acute), -(e-caron): palatalise() either rewrites
            // the palatalised ending or drops the vowel.
            if (endsWithAny(buffer, "e", "i", "\u00ed", "\u011b")) {
                palatalise(buffer);
                return;
            }
            // -u, -y, -(u-ring), -a, -o, -(a-acute), -(e-acute), -(y-acute)
            if (endsWithAny(buffer,
                    "u", "y", "\u016f", "a", "o", "\u00e1", "\u00e9", "\u00fd")) {
                dropLast(buffer, 1);
            }
        }
    }

    /** Returns true if {@code word} ends with {@code suffix}. */
    private static boolean endsWith(CharSequence word, String suffix) {
        int offset = word.length() - suffix.length();
        if (offset < 0) {
            return false;
        }
        for (int i = 0; i < suffix.length(); i++) {
            if (word.charAt(offset + i) != suffix.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /** Returns true if {@code word} ends with at least one of {@code suffixes}. */
    private static boolean endsWithAny(CharSequence word, String... suffixes) {
        for (String suffix : suffixes) {
            if (endsWith(word, suffix)) {
                return true;
            }
        }
        return false;
    }

    /** Deletes the last {@code count} characters of {@code buffer}. */
    private static void dropLast(StringBuilder buffer, int count) {
        int len = buffer.length();
        buffer.delete(len - count, len);
    }
}