diff --git a/korea_nlp/korea_nlp_komoran.py b/korea_nlp/korea_nlp_komoran.py
index 3c553ed4653e6681aca75df07196b45b2359559a..a975c0c5394a92df517b5bab4056f828c68263d1 100644
--- a/korea_nlp/korea_nlp_komoran.py
+++ b/korea_nlp/korea_nlp_komoran.py
@@ -35,4 +35,54 @@ class Tokenizer:
         diff_noun_list = list(set(noun_list) - set(compared_noun_list))
         diff_noun_list = list(set(diff_noun_list) - set(score_dic.keys()))
         noun_list = list(set(noun_list) - set(diff_noun_list))
-        return noun_listd
\ No newline at end of file
+        return noun_list
+
+    def noun_extract_dup(self, sentence, score_dic):
+        """Extract nouns from *sentence*, keeping repeated occurrences.
+
+        Tokens found in ``score_dic`` are kept as-is; every other token is
+        passed through ``self.t.nouns``. A candidate is kept only if it is
+        a key of ``score_dic`` or is also returned by ``self.t.nouns`` for
+        the whole sentence. Duplicates and token order are preserved so
+        callers can count occurrences.
+
+        Args:
+            sentence: Text to analyse.
+            score_dic: Dict-like object keyed by known words; passed to
+                ``MaxScoreTokenizer`` as ``scores``.
+
+        Returns:
+            List of nouns in token order, duplicates included.
+        """
+        tokens = MaxScoreTokenizer(scores=score_dic).tokenize(sentence)
+        # Built once as a set so the final filter is a cheap lookup.
+        sentence_nouns = set(self.t.nouns(sentence))
+
+        candidates = []
+        for token in tokens:
+            if token in score_dic:
+                candidates.append(token)
+            else:
+                candidates.extend(self.t.nouns(token))
+
+        # Filtering (instead of set difference) keeps repeated nouns.
+        return [
+            noun for noun in candidates
+            if noun in score_dic or noun in sentence_nouns
+        ]
+
+    def noun_counter(self, sentence, score_dic, word):
+        """Count occurrences of *word* among the nouns of *sentence*.
+
+        Uses ``noun_extract_dup`` because a de-duplicated noun list could
+        only ever yield 0 or 1.
+
+        Args:
+            sentence: Text to analyse.
+            score_dic: Dict-like object keyed by known words.
+            word: Noun to count.
+
+        Returns:
+            Number of times *word* appears in the extracted nouns.
+        """
+        return self.noun_extract_dup(sentence, score_dic).count(word)
\ No newline at end of file