// Gradle build script: declares the Twitter Korean text library (korean-text 4.4)
// that the Java snippet in this file calls through TwitterKoreanProcessorJava.
dependencies {
    implementation("com.twitter.penguin:korean-text:4.4")
}
// Usage walkthrough: normalize -> tokenize -> stem a chat message, then print the
// stemmed strings, the noun/adjective tokens, and the extracted phrases.
//
// The part-of-speech constants are referenced through the KoreanPosJava enum
// (same package as TwitterKoreanProcessorJava: com.twitter.penguin.korean), so that
// enum has to be imported together with the processor class.

// Normalize the raw message text before any further processing.
CharSequence normalized = TwitterKoreanProcessorJava.normalize(dailyChatMessage.getMessage());

// Tokenize the normalized text. tokenize() already returns Seq<KoreanToken>, so no cast is needed.
Seq<KoreanTokenizer.KoreanToken> tokens = TwitterKoreanProcessorJava.tokenize(normalized);

// Stem the tokens. stem() also returns Seq<KoreanToken>.
Seq<KoreanTokenizer.KoreanToken> stemmed = TwitterKoreanProcessorJava.stem(tokens);

// Plain string list: one entry per stemmed token.
List<String> stemmedStringList = TwitterKoreanProcessorJava.tokensToJavaStringList(stemmed);
for (String string : stemmedStringList) {
    System.out.println(string);
}

// KoreanTokenJava list, e.g. [학교(Noun: 15, 2), 가다(Verb: 18, 2), 가족(Noun: 0, 2)].
// Each entry carries the token text together with its part of speech.
List<KoreanTokenJava> stemmedTokenList = TwitterKoreanProcessorJava.tokensToJavaKoreanTokenList(stemmed);
for (KoreanTokenJava koreanTokenJava : stemmedTokenList) {
    // Only nouns and adjectives are printed; every other part of speech is skipped.
    KoreanPosJava pos = koreanTokenJava.getPos();
    if (pos == KoreanPosJava.Noun || pos == KoreanPosJava.Adjective) {
        System.out.println(koreanTokenJava.getText());
    }
}

// Phrase list: extracted from the un-stemmed tokens, with both boolean options enabled.
List<KoreanPhraseExtractor.KoreanPhrase> phrases = TwitterKoreanProcessorJava.extractPhrases(tokens, true, true);
for (KoreanPhraseExtractor.KoreanPhrase item : phrases) {
    System.out.println(item.text());
}
koreanTokenJava.getPos() : 품사를 가져온다
품사 종류
// Word level POS
Noun, Verb, Adjective,
Adverb, Determiner, Exclamation,
Josa, Eomi, PreEomi, Conjunction,
NounPrefix, VerbPrefix, Suffix, Unknown,
// Chunk level POS
Korean, Foreign, Number, KoreanParticle, Alpha,
Punctuation, Hashtag, ScreenName,
Email, URL, CashTag,
// Functional POS
Space, Others,
ProperNoun;
java 예시가 없었어서 적어본다
반응형
'개발 > AI' 카테고리의 다른 글
감정 분석 AI (kobert / onnxruntime 이슈) (0) | 2022.02.20 |
---|