Skip to content
Snippets Groups Projects
Commit 95438c6a authored by kimsumin's avatar kimsumin
Browse files

Upload code file

parent 537bf58f
Branches
No related tags found
No related merge requests found
%% Cell type:code id: tags:
```
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import re
import urllib.request
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
```
%% Cell type:code id: tags:
```
# Download both ratings files next to the notebook, then load each
# tab-separated file into its own DataFrame.
_downloads = (
    ("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", "ratings_train.txt"),
    ("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", "ratings_test.txt"),
)
for _url, _path in _downloads:
    urllib.request.urlretrieve(_url, filename=_path)
train_data, test_data = (pd.read_table(_path) for _, _path in _downloads)
```
%% Cell type:code id: tags:
```
# Report the number of rows in the training and test frames.
n_train, n_test = len(train_data), len(test_data)
print('훈련용 리뷰 개수 :', n_train)
print('테스트용 리뷰 개수 :', n_test)
```
%% Output
훈련용 리뷰 개수 : 150000
테스트용 리뷰 개수 : 50000
%% Cell type:code id: tags:
```
```
%% Cell type:code id: tags:
```
train_data.head(5)  # display the first 5 rows
```
%% Output
id document label
0 9976970 아 더빙.. 진짜 짜증나네요 목소리 0
1 3819312 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나 1
2 10265843 너무재밓었다그래서보는것을추천한다 0
3 9045019 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정 0
4 6483659 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ... 1
%% Cell type:code id: tags:
```
# Number of distinct values in the 'document' and 'label' columns, as a tuple.
tuple(train_data[column].nunique() for column in ('document', 'label'))
```
%% Output
(146182, 2)
%% Cell type:code id: tags:
```
# Drop rows whose 'document' text repeats an earlier row, keeping the first occurrence.
train_data.drop_duplicates(subset='document', keep='first', inplace=True)
```
%% Cell type:code id: tags:
```
# Check the training frame for missing values: it is the frame that the
# following dropna() step cleans (the original inspected test_data here).
print(train_data.isnull().values.any())
```
%% Cell type:code id: tags:
```
# Discard every training row that has a missing value in any column.
train_data = train_data.dropna(axis=0, how='any')
```
%% Cell type:code id: tags:
```
# Remove everything except Hangul (jamo and syllables) and spaces.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
train_data['document'] = train_data['document'].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
train_data[:5]
```
%% Output
id document label
0 9976970 아 더빙 진짜 짜증나네요 목소리 0
1 3819312 흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나 1
2 10265843 너무재밓었다그래서보는것을추천한다 0
3 9045019 교도소 이야기구먼 솔직히 재미는 없다평점 조정 0
4 6483659 사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던... 1
%% Cell type:code id: tags:
```
# Turn reviews that are empty or whitespace-only after cleaning into NaN so
# dropna() removes them. The result is assigned back to the column instead of
# calling replace(..., inplace=True) on the column selection: that chained
# in-place form acts on a temporary object under pandas copy-on-write and
# leaves the frame unchanged.
train_data['document'] = train_data['document'].replace(r'^\s*$', np.nan, regex=True)
train_data = train_data.dropna(how='any')
print('전처리 후 훈련용 샘플의 개수 :', len(train_data))
```
%% Output
145791
%% Cell type:code id: tags:
```
# Drop rows whose 'document' text duplicates an earlier row.
test_data.drop_duplicates(subset=['document'], inplace=True)
# Keep only Hangul and spaces. regex=True is required because pandas >= 2.0
# treats the pattern as a literal string by default.
test_data['document'] = test_data['document'].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
# Mark empty / whitespace-only reviews as NaN. Assign the result back rather
# than using a chained inplace replace, which does not update the frame under
# pandas copy-on-write.
test_data['document'] = test_data['document'].replace(r'^\s*$', np.nan, regex=True)
# Remove rows with any missing value.
test_data = test_data.dropna(how='any')
print('전처리 후 테스트용 샘플의 개수 :', len(test_data))
```
%% Output
전처리 후 테스트용 샘플의 개수 : 48995
%% Cell type:code id: tags:
```
!pip install konlpy
```
%% Output
Collecting konlpy
[?25l Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
 |████████████████████████████████| 19.4MB 1.3MB/s
[?25hCollecting beautifulsoup4==4.6.0
[?25l Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
 |████████████████████████████████| 92kB 8.1MB/s
[?25hRequirement already satisfied: numpy>=1.6 in /usr/local/lib/python3.6/dist-packages (from konlpy) (1.19.4)
Collecting JPype1>=0.7.0
[?25l Downloading https://files.pythonhosted.org/packages/b7/21/9e2c0dbf9df856e6392a1aec1d18006c60b175aa4e31d351e8278a8a63c0/JPype1-1.2.0-cp36-cp36m-manylinux2010_x86_64.whl (453kB)
 |████████████████████████████████| 460kB 42.1MB/s
[?25hRequirement already satisfied: lxml>=4.1.0 in /usr/local/lib/python3.6/dist-packages (from konlpy) (4.2.6)
Collecting colorama
Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting tweepy>=3.7.0
Downloading https://files.pythonhosted.org/packages/bb/7c/99d51f80f3b77b107ebae2634108717362c059a41384a1810d13e2429a81/tweepy-3.9.0-py2.py3-none-any.whl
Requirement already satisfied: typing-extensions; python_version < "3.8" in /usr/local/lib/python3.6/dist-packages (from JPype1>=0.7.0->konlpy) (3.7.4.3)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (1.15.0)
Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (1.3.0)
Requirement already satisfied: requests[socks]>=2.11.1 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (2.23.0)
Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->tweepy>=3.7.0->konlpy) (3.1.0)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (3.0.4)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.24.3)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2020.12.5)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2.10)
Requirement already satisfied: PySocks!=1.5.7,>=1.5.6; extra == "socks" in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.7.1)
Installing collected packages: beautifulsoup4, JPype1, colorama, tweepy, konlpy
Found existing installation: beautifulsoup4 4.6.3
Uninstalling beautifulsoup4-4.6.3:
Successfully uninstalled beautifulsoup4-4.6.3
Found existing installation: tweepy 3.6.0
Uninstalling tweepy-3.6.0:
Successfully uninstalled tweepy-3.6.0
Successfully installed JPype1-1.2.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2 tweepy-3.9.0
%% Cell type:code id: tags:
```
from konlpy.tag import Okt
```
%% Cell type:code id: tags:
```
# Create the Okt analyzer and show its stemmed morphemes for the row labelled 1.
okt = Okt()
sample_review = train_data['document'][1]
print(sample_review)
okt.morphs(sample_review, stem=True)
```
%% Output
흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나
['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍다', '않다']
%% Cell type:code id: tags:
```
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
# Hashed copy of the stopword list, built once for the membership tests below.
_stopword_set = frozenset(stopwords)


def _tokenize_reviews(documents):
    """Return one token list per review in *documents*.

    Each review is split into stemmed morphemes with ``okt.morphs`` and the
    tokens listed in ``stopwords`` are removed. The order of reviews and of
    the tokens inside each review is preserved.
    """
    return [
        [token for token in okt.morphs(sentence, stem=True) if token not in _stopword_set]
        for sentence in documents
    ]


# The same tokenisation is applied to both splits.
X_train = _tokenize_reviews(train_data['document'])
X_test = _tokenize_reviews(test_data['document'])
```
%% Cell type:code id: tags:
```
# Show the first three training samples.
first_three = X_train[:3]
print(first_three)
```
%% Output
[[51, 455, 17, 261, 660], [934, 458, 42, 603, 2, 215, 1450, 25, 962, 676, 20], [387, 2445, 1, 2316, 5672, 3, 223, 10]]
%% Cell type:code id: tags:
```
# Fit a tokenizer on the training tokens, then print the word -> index
# mapping and the number of entries in it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_to_index = tokenizer.word_index
print(word_to_index)
print(len(word_to_index))
```
%% Cell type:code id: tags:
```
threshold = 3
total_cnt = len(tokenizer.word_index)  # number of distinct words
# Number of words whose occurrence count is below the threshold.
rare_cnt = sum(1 for count in tokenizer.word_counts.values() if count < threshold)
print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
```
%% Output
단어 집합(vocabulary)의 크기 : 43752
등장 빈도가 2번 이하인 희귀 단어의 수: 24337
%% Cell type:code id: tags:
```
# Vocabulary size: words at or above the frequency threshold, plus 2.
# NOTE(review): the +2 presumably reserves ids for padding (0) and the OOV token - confirm.
kept_cnt = total_cnt - rare_cnt
vocab_size = kept_cnt + 2
print('단어 집합의 크기 :', vocab_size)
```
%% Output
단어 집합의 크기 : 19417
%% Cell type:code id: tags:
```
# Rebuild the tokenizer limited to vocab_size words with an explicit OOV
# token, fit it on the training tokens, then integer-encode both splits.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(X_train)
X_train, X_test = (tokenizer.texts_to_sequences(tokens) for tokens in (X_train, X_test))
```
%% Cell type:code id: tags:
```
# Pad or truncate every encoded review to a fixed length of max_len.
max_len = 30
X_train, X_test = (pad_sequences(seqs, maxlen=max_len) for seqs in (X_train, X_test))
```
%% Cell type:code id: tags:
```
# Copy the 'label' column of each frame into a NumPy array, then display
# the encoded training sample at position 1.
y_train, y_test = (np.array(frame['label']) for frame in (train_data, test_data))
X_train[1]
```
%% Cell type:code id: tags:
```
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
```
%% Cell type:code id: tags:
```
# Embedding (100-dim) -> LSTM (128 units) -> single sigmoid output unit.
model = Sequential([
    Embedding(vocab_size, 100),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])
```
%% Cell type:code id: tags:
```
# Stop training once val_loss has not improved for 4 epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=4,
    verbose=1,
)
# Save the model to best_model.h5 only when val_acc reaches a new maximum.
mc = ModelCheckpoint(
    filepath='best_model.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
```
%% Cell type:code id: tags:
```
# Compile for binary classification and train, holding out 20% of the
# training data for validation; es and mc are the callbacks run each epoch.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=60,
    epochs=15,
    validation_split=0.2,
    callbacks=[es, mc],
)
```
%% Output
Epoch 1/15
1944/1944 [==============================] - 159s 81ms/step - loss: 0.3858 - acc: 0.8256 - val_loss: 0.3499 - val_acc: 0.8468
Epoch 00001: val_acc improved from -inf to 0.84684, saving model to best_model.h5
Epoch 2/15
1944/1944 [==============================] - 156s 80ms/step - loss: 0.3294 - acc: 0.8567 - val_loss: 0.3359 - val_acc: 0.8548
Epoch 00002: val_acc improved from 0.84684 to 0.85476, saving model to best_model.h5
Epoch 3/15
1944/1944 [==============================] - 158s 81ms/step - loss: 0.3003 - acc: 0.8716 - val_loss: 0.3314 - val_acc: 0.8572
Epoch 00003: val_acc improved from 0.85476 to 0.85723, saving model to best_model.h5
Epoch 4/15
1944/1944 [==============================] - 155s 80ms/step - loss: 0.2806 - acc: 0.8821 - val_loss: 0.3319 - val_acc: 0.8554
Epoch 00004: val_acc did not improve from 0.85723
Epoch 5/15
1944/1944 [==============================] - 153s 79ms/step - loss: 0.2611 - acc: 0.8928 - val_loss: 0.3367 - val_acc: 0.8603
Epoch 00005: val_acc improved from 0.85723 to 0.86028, saving model to best_model.h5
Epoch 6/15
1944/1944 [==============================] - 153s 79ms/step - loss: 0.2492 - acc: 0.8998 - val_loss: 0.3365 - val_acc: 0.8559
Epoch 00006: val_acc did not improve from 0.86028
Epoch 7/15
1944/1944 [==============================] - 153s 79ms/step - loss: 0.2340 - acc: 0.9070 - val_loss: 0.3479 - val_acc: 0.8555
Epoch 00007: val_acc did not improve from 0.86028
Epoch 00007: early stopping
%% Cell type:code id: tags:
```
# Reload the checkpointed model and compare its predictions for the first
# five test samples with their labels.
loaded_model = load_model('best_model.h5')
sample_predictions = loaded_model.predict(X_test[:5])
print(sample_predictions)
print(y_test[:5])
```
%% Output
[[0.98269826]
[0.10281318]
[0.00681123]
[0.08977833]
[0.9861894 ]]
[1 0 0 0 1]
%% Cell type:code id: tags:
```
# evaluate() returns the loss followed by the compiled metrics; index 1 is the accuracy.
eval_results = loaded_model.evaluate(X_test, y_test)
print("\n 테스트 정확도: %.4f" % eval_results[1])
```
%% Output
1532/1532 [==============================] - 19s 12ms/step - loss: 0.3434 - acc: 0.8576
테스트 정확도: 0.8576
%% Cell type:code id: tags:
```
def s_analysis(new_sentence):
    """Print the predicted sentiment of a single review sentence.

    The sentence is cleaned, tokenised with ``okt``, stripped of
    ``stopwords``, integer-encoded with ``tokenizer``, padded to ``max_len``
    and scored by ``loaded_model``. A score above 0.5 is reported as a
    positive review, anything else as a negative review; the printed
    percentage is the probability of the reported class.

    Args:
        new_sentence: Raw review text.

    Returns:
        None. The result is printed.
    """
    # Apply the same character filter that was applied to the training text
    # (Hangul and spaces only) so inference input matches what the model saw.
    cleaned = re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", new_sentence)
    tokens = [word for word in okt.morphs(cleaned, stem=True) if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([tokens])  # integer encoding
    pad_new = pad_sequences(encoded, maxlen=max_len)  # padding
    # predict() yields one row with one value for the single input; index it
    # explicitly because float() on a multi-dimensional array is deprecated
    # in recent NumPy versions.
    score = float(loaded_model.predict(pad_new)[0][0])
    if score > 0.5:
        print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(score * 100))
    else:
        print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1 - score) * 100))
```
%% Cell type:code id: tags:
```
# Run the classifier on a sample sentence and print its predicted sentiment.
s_analysis("진짜 재미없고 최악이다")
```
%% Output
99.73% 확률로 부정 리뷰입니다.
%% Cell type:code id: tags:
```
# Run the classifier on a sample sentence that contains punctuation.
s_analysis("진짜 최고다. 정말 박진감있고 흥미로웠다")
```
%% Output
99.76% 확률로 긍정 리뷰입니다.
%% Cell type:code id: tags:
```
# Run the classifier on another sample sentence.
s_analysis("정말 기분이 좋네요")
```
%% Output
96.36% 확률로 긍정 리뷰입니다.
%% Cell type:code id: tags:
```
# Repeat of an earlier sample sentence; the same input is scored again.
s_analysis("진짜 재미없고 최악이다")
```
%% Output
99.73% 확률로 부정 리뷰입니다.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment