/**
 * Aggressive rule-based stemmer for Czech.
 *
 * <p>Removes case endings from nouns and adjectives, possessive adjective
 * endings from names, diminutive, augmentative and comparative suffixes and
 * derivational suffixes from nouns, and takes care of palatalisation.
 *
 * <p>All suffix literals are lower case and the input is not lower-cased by
 * this class, so callers should pass lower-case words.
 *
 * <p>Notation used in the comments below (the source is kept ASCII-only, the
 * literals carry the exact code points as Unicode escapes): a trailing
 * {@code '} marks an acute accent, a trailing {@code ^} marks a caron and
 * {@code u~} is u with ring. Example: {@code -a'c^ek} is the suffix written
 * with a-acute and c-caron.
 *
 * @author Ljiljana Dolamic, University of Neuchatel
 */
public class CzechStemmerAgressive {

    /**
     * Default constructor.
     */
    public CzechStemmerAgressive() {
    }

    /**
     * Stems a single word.
     *
     * <p>The stages run in a fixed order: case endings, possessives,
     * comparatives, diminutives, augmentatives, derivational suffixes. Each
     * stage applies at most one rule.
     *
     * @param input the word to stem; expected to be lower case
     * @return the stem, or {@code null} when {@code input} is {@code null}
     */
    public String stem(String input) {
        if (input == null) {
            return null;
        }
        // A local buffer keeps the method free of shared mutable state.
        final StringBuilder buffer = new StringBuilder(input);

        removeCase(buffer);
        removePossessives(buffer);
        removeComparative(buffer);
        removeDiminutive(buffer);
        removeAugmentative(buffer);
        removeDerivational(buffer);

        return buffer.toString();
    }

    /** Returns true when the buffer ends with {@code suffix}. */
    private static boolean endsWith(StringBuilder buffer, String suffix) {
        final int offset = buffer.length() - suffix.length();
        if (offset < 0) {
            return false;
        }
        for (int i = 0; i < suffix.length(); i++) {
            if (buffer.charAt(offset + i) != suffix.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /** Returns true when the buffer ends with at least one of {@code suffixes}. */
    private static boolean endsWithAny(StringBuilder buffer, String... suffixes) {
        for (String suffix : suffixes) {
            if (endsWith(buffer, suffix)) {
                return true;
            }
        }
        return false;
    }

    /** Drops the last {@code count} characters of the buffer. */
    private static void chop(StringBuilder buffer, int count) {
        buffer.setLength(buffer.length() - count);
    }

    /** Replaces the last {@code count} characters of the buffer with {@code replacement}. */
    private static void replaceTail(StringBuilder buffer, int count, String replacement) {
        chop(buffer, count);
        buffer.append(replacement);
    }

    /**
     * Removes derivational suffixes from nouns. Longer suffixes are tried
     * first; every tier is guarded by a minimum word length. Rules whose
     * suffix starts with a front vowel keep that vowel and hand it to
     * {@link #palatalise(StringBuilder)}.
     */
    private static void removeDerivational(StringBuilder buffer) {
        final int len = buffer.length();

        if (len > 8 && endsWith(buffer, "obinec")) {
            chop(buffer, 6);
            return;
        }

        if (len > 7) {
            if (endsWith(buffer, "ion\u00e1\u0159")) { // -iona'r^
                chop(buffer, 4);
                palatalise(buffer);
                return;
            }
            if (endsWithAny(buffer,
                    "ovisk",
                    "ovstv",
                    "ovi\u0161t",      // -ovis^t
                    "ovn\u00edk")) {   // -ovni'k
                chop(buffer, 5);
                return;
            }
        }

        if (len > 6) {
            if (endsWithAny(buffer,
                    "\u00e1sek",       // -a'sek
                    "loun",
                    "nost",
                    "teln",
                    "ovec",
                    "ov\u00edk",       // -ovi'k (4 chars; was compared against a 5-char tail)
                    "ovtv",
                    "ovin",
                    "\u0161tin")) {    // -s^tin
                chop(buffer, 4);
                return;
            }
            if (endsWithAny(buffer, "enic", "inec", "itel")) {
                chop(buffer, 3);
                palatalise(buffer);
                return;
            }
        }

        if (len > 5) {
            if (endsWith(buffer, "\u00e1rn")) { // -a'rn
                chop(buffer, 3);
                return;
            }
            if (endsWithAny(buffer,
                    "\u011bnk",        // -e^nk
                    "i\u00e1n",        // -ia'n
                    "ist",
                    "isk",
                    "i\u0161t",        // -is^t
                    "itb",
                    "\u00edrn")) {     // -i'rn
                chop(buffer, 2);
                palatalise(buffer);
                return;
            }
            if (endsWithAny(buffer,
                    "och",
                    "ost",
                    "ovn",
                    "oun",
                    "out",
                    "ou\u0161",        // -ous^
                    "u\u0161k",        // -us^k
                    "kyn",
                    "\u010dan",        // -c^an
                    "k\u00e1\u0159",   // -ka'r^
                    "n\u00e9\u0159",   // -ne'r^
                    "n\u00edk",        // -ni'k
                    "ctv",
                    "stv")) {
                chop(buffer, 3);
                return;
            }
        }

        if (len > 4) {
            if (endsWithAny(buffer,
                    "\u00e1\u010d",    // -a'c^
                    "a\u010d",         // -ac^
                    "\u00e1n",         // -a'n
                    "an",
                    "\u00e1\u0159",    // -a'r^
                    "as")) {
                chop(buffer, 2);
                return;
            }
            if (endsWithAny(buffer,
                    "ec",
                    "en",
                    "\u011bn",         // -e^n
                    "\u00e9\u0159",    // -e'r^
                    "\u00ed\u0159",    // -i'r^
                    "ic",
                    "in",
                    "\u00edn",         // -i'n
                    "it",
                    "iv")) {
                chop(buffer, 1);
                palatalise(buffer);
                return;
            }
            if (endsWithAny(buffer,
                    "ob",
                    "ot",
                    "ov",
                    "o\u0148",         // -on^
                    "ul",
                    "yn",
                    "\u010dk",         // -c^k
                    "\u010dn",         // -c^n
                    "dl",
                    "nk",
                    "tv",
                    "tk",
                    "vk")) {
                chop(buffer, 2);
                return;
            }
        }

        if (len > 3) {
            final char last = buffer.charAt(len - 1);
            if (last == 'c' || last == '\u010d' /* c^ */ || last == 'k'
                    || last == 'l' || last == 'n' || last == 't') {
                chop(buffer, 1);
            }
        }
    }

    /**
     * Removes augmentative suffixes.
     */
    private static void removeAugmentative(StringBuilder buffer) {
        final int len = buffer.length();

        if (len > 6 && endsWith(buffer, "ajzn")) {
            chop(buffer, 4);
            return;
        }
        if (len > 5 && endsWithAny(buffer, "izn", "isk")) {
            chop(buffer, 2);
            palatalise(buffer);
            return;
        }
        // -a'k; the literal used to be an octal escape and could never match.
        if (len > 4 && endsWith(buffer, "\u00e1k")) {
            chop(buffer, 2);
        }
    }

    /**
     * Removes diminutive suffixes.
     */
    private static void removeDiminutive(StringBuilder buffer) {
        final int len = buffer.length();

        if (len > 7 && endsWith(buffer, "ou\u0161ek")) { // -ous^ek
            chop(buffer, 5);
            return;
        }

        if (len > 6) {
            if (endsWithAny(buffer,
                    "e\u010dek",       // -ec^ek
                    "\u00e9\u010dek",  // -e'c^ek
                    "i\u010dek",       // -ic^ek
                    "\u00ed\u010dek",  // -i'c^ek
                    "enek",
                    "\u00e9nek",       // -e'nek
                    "inek",
                    "\u00ednek")) {    // -i'nek
                chop(buffer, 3);
                palatalise(buffer);
                return;
            }
            if (endsWithAny(buffer,
                    "\u00e1\u010dek",  // -a'c^ek
                    "a\u010dek",       // -ac^ek
                    "o\u010dek",       // -oc^ek
                    "u\u010dek",       // -uc^ek
                    "anek",
                    "onek",
                    "unek",
                    "\u00e1nek")) {    // -a'nek
                chop(buffer, 4);
                return;
            }
        }

        if (len > 5) {
            // NOTE(review): unlike the 4-character tier above, this removes
            // the whole suffix before palatalise() runs, so one more stem
            // character is consumed. Kept as in the original; confirm intent.
            if (endsWithAny(buffer,
                    "e\u010dk",        // -ec^k
                    "\u00e9\u010dk",   // -e'c^k
                    "i\u010dk",        // -ic^k
                    "\u00ed\u010dk",   // -i'c^k
                    "enk",
                    "\u00e9nk",        // -e'nk
                    "ink",
                    "\u00ednk")) {     // -i'nk
                chop(buffer, 3);
                palatalise(buffer);
                return;
            }
            if (endsWithAny(buffer,
                    "\u00e1\u010dk",   // -a'c^k
                    "a\u010dk",        // -ac^k (escape was missing its backslash)
                    "o\u010dk",        // -oc^k
                    "u\u010dk",        // -uc^k
                    "ank",
                    "onk",
                    "unk",
                    "\u00e1tk",        // -a'tk
                    "\u00e1nk",        // -a'nk
                    "u\u0161k")) {     // -us^k
                chop(buffer, 3);
                return;
            }
        }

        if (len > 4) {
            if (endsWithAny(buffer,
                    "ek",
                    "\u00e9k",         // -e'k
                    "\u00edk",         // -i'k
                    "ik")) {
                chop(buffer, 1);
                palatalise(buffer);
                return;
            }
            if (endsWithAny(buffer,
                    "\u00e1k",         // -a'k
                    "ak",
                    "ok",
                    "uk")) {
                chop(buffer, 1);
                return;
            }
        }

        if (len > 3 && endsWith(buffer, "k")) {
            chop(buffer, 1);
        }
    }

    /**
     * Removes comparative suffixes.
     */
    private static void removeComparative(StringBuilder buffer) {
        if (buffer.length() > 5
                && endsWithAny(buffer,
                        "ej\u0161",        // -ejs^
                        "\u011bj\u0161")) { // -e^js^
            chop(buffer, 2);
            palatalise(buffer);
        }
    }

    /**
     * Undoes palatalisation at the end of the buffer: a softened consonant
     * plus the vowel that follows it is replaced by the hard consonant(s).
     * When no rule matches, the last character is dropped.
     */
    private static void palatalise(StringBuilder buffer) {
        if (endsWithAny(buffer,
                "ci",
                "ce",
                "\u010di",             // -c^i
                "\u010de")) {          // -c^e
            replaceTail(buffer, 2, "k");
            return;
        }
        if (endsWithAny(buffer,
                "zi",
                "ze",
                "\u017ei",             // -z^i
                "\u017ee")) {          // -z^e
            replaceTail(buffer, 2, "h");
            return;
        }
        if (endsWithAny(buffer,
                "\u010dt\u011b",       // -c^te^
                "\u010dti",            // -c^ti
                "\u010dt\u00ed")) {    // -c^ti'
            replaceTail(buffer, 3, "ck");
            return;
        }
        // These suffixes are 3 characters long; they used to be compared
        // against a 2-character tail, which made the rule unreachable.
        if (endsWithAny(buffer,
                "\u0161t\u011b",       // -s^te^
                "\u0161ti",            // -s^ti
                "\u0161t\u00ed")) {    // -s^ti'
            replaceTail(buffer, 3, "sk");
            return;
        }
        if (buffer.length() > 0) {
            chop(buffer, 1);
        }
    }

    /**
     * Removes the possessive endings -ov, -u~v and -in from names.
     */
    private static void removePossessives(StringBuilder buffer) {
        if (buffer.length() <= 5) {
            return;
        }
        if (endsWithAny(buffer, "ov", "\u016fv")) { // -ov, -u~v
            chop(buffer, 2);
            return;
        }
        if (endsWith(buffer, "in")) {
            chop(buffer, 1);
            palatalise(buffer);
        }
    }

    /**
     * Removes case endings from nouns and adjectives.
     */
    private static void removeCase(StringBuilder buffer) {
        final int len = buffer.length();

        if (len > 7 && endsWith(buffer, "atech")) {
            chop(buffer, 5);
            return;
        }

        if (len > 6) {
            if (endsWith(buffer, "\u011btem")) { // -e^tem
                chop(buffer, 3);
                palatalise(buffer);
                return;
            }
            if (endsWith(buffer, "at\u016fm")) { // -atu~m
                chop(buffer, 4);
                return;
            }
        }

        if (len > 5) {
            if (endsWithAny(buffer,
                    "ech",
                    "ich",
                    "\u00edch",        // -i'ch
                    "\u00e9ho",        // -e'ho
                    "\u011bmi",        // -e^mi
                    "emi",
                    "\u00e9mu",        // -e'mu
                    "ete",
                    "eti",
                    "iho",
                    "\u00edho",        // -i'ho
                    "\u00edmi",        // -i'mi
                    "imu")) {
                chop(buffer, 2);
                palatalise(buffer);
                return;
            }
            if (endsWithAny(buffer,
                    "\u00e1ch",        // -a'ch
                    "ata",
                    "aty",
                    "\u00fdch",        // -y'ch
                    "ama",
                    "ami",
                    "ov\u00e9",        // -ove'
                    "ovi",
                    "\u00fdmi")) {     // -y'mi
                chop(buffer, 3);
                return;
            }
        }

        if (len > 4) {
            if (endsWith(buffer, "em")) {
                chop(buffer, 1);
                palatalise(buffer);
                return;
            }
            // NOTE(review): unlike "-em" above, the whole ending is removed
            // before palatalise() runs, so one more stem character is
            // consumed. Kept as in the original; confirm intent.
            if (endsWithAny(buffer,
                    "es",
                    "\u00e9m",         // -e'm
                    "\u00edm")) {      // -i'm
                chop(buffer, 2);
                palatalise(buffer);
                return;
            }
            if (endsWithAny(buffer,
                    "\u016fm",         // -u~m
                    "at",
                    "\u00e1m",         // -a'm
                    "os",
                    "us",
                    "\u00fdm",         // -y'm
                    "mi",
                    "ou")) {
                chop(buffer, 2);
                return;
            }
        }

        if (len > 3) {
            // Front vowels may have softened the preceding consonant.
            if (endsWithAny(buffer,
                    "e",
                    "i",
                    "\u00ed",          // -i'
                    "\u011b")) {       // -e^
                palatalise(buffer);
                return;
            }
            if (endsWithAny(buffer,
                    "u",
                    "y",
                    "\u016f",          // -u~
                    "a",
                    "o",
                    "\u00e1",          // -a'
                    "\u00e9",          // -e'
                    "\u00fd")) {       // -y'
                chop(buffer, 1);
            }
        }
    }
}