diff --git a/NLP.ipynb b/NLP.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..fc53a3a0de9b70584f5d2c3d3e9ea5f84838b686 --- /dev/null +++ b/NLP.ipynb @@ -0,0 +1,912 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "NLP.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "LoImN07UxAn2" + }, + "source": [ + "import pandas as pd\r\n", + "import numpy as np\r\n", + "%matplotlib inline\r\n", + "import matplotlib.pyplot as plt\r\n", + "import re\r\n", + "import urllib.request\r\n", + "from tensorflow.keras.preprocessing.text import Tokenizer\r\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences" + ], + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Yf1ceqLwz_72" + }, + "source": [ + "urllib.request.urlretrieve(\"https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt\", filename=\"ratings_train.txt\")\r\n", + "urllib.request.urlretrieve(\"https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt\", filename=\"ratings_test.txt\")\r\n", + "\r\n", + "train_data = pd.read_table('ratings_train.txt')\r\n", + "test_data = pd.read_table('ratings_test.txt')" + ], + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PfZhNCB10AWy", + "outputId": "ec183025-6db0-470e-b85b-9f3b22c118e0" + }, + "source": [ + "print('훈련용 리뷰 개수 :',len(train_data)) # 훈련용 리뷰 개수 출력\r\n", + "print('테스트용 리뷰 개수 :',len(test_data)) # 테스트용 리뷰 개수 출력" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "text": [ + "훈련용 리뷰 개수 : 150000\n", + "테스트용 리뷰 개수 : 50000\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "dFki1IvRY8ob" + }, + "source": [ + "" + ], + 
"execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + }, + "id": "lfEqvka20D7v", + "outputId": "04f01fb2-5b85-4443-ae8b-803c210daf65" + }, + "source": [ + "train_data[:5] # 상위 5개 출력" + ], + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id</th>\n", + " <th>document</th>\n", + " <th>label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>9976970</td>\n", + " <td>아 더빙.. 진짜 짜증나네요 목소리</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>3819312</td>\n", + " <td>흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>10265843</td>\n", + " <td>너무재밓었다그래서보는것을추천한다</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9045019</td>\n", + " <td>교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>6483659</td>\n", + " <td>사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id document label\n", + "0 9976970 아 더빙.. 진짜 짜증나네요 목소리 0\n", + "1 3819312 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나 1\n", + "2 10265843 너무재밓었다그래서보는것을추천한다 0\n", + "3 9045019 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정 0\n", + "4 6483659 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ... 
1" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0_QgSxri4rbN", + "outputId": "4bcdc6db-b9a1-4034-8919-ca25654f7c27" + }, + "source": [ + "train_data['document'].nunique(), train_data['label'].nunique()" + ], + "execution_count": 22, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(146182, 2)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 22 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "MPWimSwD4u5l" + }, + "source": [ + "train_data.drop_duplicates(subset=['document'], inplace=True)" + ], + "execution_count": 23, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "fOpiWx7N0RU0" + }, + "source": [ + "print(test_data.isnull().values.any())" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "YEbx4OUP0qNE" + }, + "source": [ + "train_data = train_data.dropna(how = 'any')" + ], + "execution_count": 100, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + }, + "id": "vk4HP6lK0Hzx", + "outputId": "90be32a8-ecd3-4c6d-bc99-92d65431a7dc" + }, + "source": [ + "train_data['document'] = train_data['document'].str.replace(\"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]\",\"\")\r\n", + "# 한글과 공백을 제외하고 모두 제거\r\n", + "train_data[:5]" + ], + "execution_count": 24, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " 
<th></th>\n", + " <th>id</th>\n", + " <th>document</th>\n", + " <th>label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>9976970</td>\n", + " <td>아 더빙 진짜 짜증나네요 목소리</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>3819312</td>\n", + " <td>흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>10265843</td>\n", + " <td>너무재밓었다그래서보는것을추천한다</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9045019</td>\n", + " <td>교도소 이야기구먼 솔직히 재미는 없다평점 조정</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>6483659</td>\n", + " <td>사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id document label\n", + "0 9976970 아 더빙 진짜 짜증나네요 목소리 0\n", + "1 3819312 흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나 1\n", + "2 10265843 너무재밓었다그래서보는것을추천한다 0\n", + "3 9045019 교도소 이야기구먼 솔직히 재미는 없다평점 조정 0\n", + "4 6483659 사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던... 
1" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 24 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1QdaNPpr4-dL", + "outputId": "62e72381-4194-4738-f6c3-f1b79bead3b2" + }, + "source": [ + "train_data['document'].replace('', np.nan, inplace=True)\r\n", + "train_data = train_data.dropna(how = 'any')\r\n", + "print('전처리 후 훈련용 샘플의 개수 :',len(train_data))" + ], + "execution_count": 26, + "outputs": [ + { + "output_type": "stream", + "text": [ + "145791\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FlcP0_l85DZY", + "outputId": "aeb52df5-b12c-4994-c200-023d2941a03e" + }, + "source": [ + "test_data.drop_duplicates(subset = ['document'], inplace=True) # document 열에서 중복인 내용이 있다면 중복 제거\r\n", + "test_data['document'] = test_data['document'].str.replace(\"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]\",\"\") # 정규 표현식 수행\r\n", + "test_data['document'].replace('', np.nan, inplace=True) # 공백은 Null 값으로 변경\r\n", + "test_data = test_data.dropna(how='any') # Null 값 제거\r\n", + "print('전처리 후 테스트용 샘플의 개수 :',len(test_data))" + ], + "execution_count": 25, + "outputs": [ + { + "output_type": "stream", + "text": [ + "전처리 후 테스트용 샘플의 개수 : 48995\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RYvpBwSM5Fq-", + "outputId": "795adda9-9d42-400d-f88c-a97b56fcd691" + }, + "source": [ + "!pip install konlpy" + ], + "execution_count": 27, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting konlpy\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)\n", + "\u001b[K |████████████████████████████████| 19.4MB 1.3MB/s \n", + "\u001b[?25hCollecting beautifulsoup4==4.6.0\n", + "\u001b[?25l Downloading 
https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)\n", + "\u001b[K |████████████████████████████████| 92kB 8.1MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy>=1.6 in /usr/local/lib/python3.6/dist-packages (from konlpy) (1.19.4)\n", + "Collecting JPype1>=0.7.0\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/b7/21/9e2c0dbf9df856e6392a1aec1d18006c60b175aa4e31d351e8278a8a63c0/JPype1-1.2.0-cp36-cp36m-manylinux2010_x86_64.whl (453kB)\n", + "\u001b[K |████████████████████████████████| 460kB 42.1MB/s \n", + "\u001b[?25hRequirement already satisfied: lxml>=4.1.0 in /usr/local/lib/python3.6/dist-packages (from konlpy) (4.2.6)\n", + "Collecting colorama\n", + " Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl\n", + "Collecting tweepy>=3.7.0\n", + " Downloading https://files.pythonhosted.org/packages/bb/7c/99d51f80f3b77b107ebae2634108717362c059a41384a1810d13e2429a81/tweepy-3.9.0-py2.py3-none-any.whl\n", + "Requirement already satisfied: typing-extensions; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from JPype1>=0.7.0->konlpy) (3.7.4.3)\n", + "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (1.15.0)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (1.3.0)\n", + "Requirement already satisfied: requests[socks]>=2.11.1 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (2.23.0)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->tweepy>=3.7.0->konlpy) (3.1.0)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from 
requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (3.0.4)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2020.12.5)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2.10)\n", + "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6; extra == \"socks\" in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.7.1)\n", + "Installing collected packages: beautifulsoup4, JPype1, colorama, tweepy, konlpy\n", + " Found existing installation: beautifulsoup4 4.6.3\n", + " Uninstalling beautifulsoup4-4.6.3:\n", + " Successfully uninstalled beautifulsoup4-4.6.3\n", + " Found existing installation: tweepy 3.6.0\n", + " Uninstalling tweepy-3.6.0:\n", + " Successfully uninstalled tweepy-3.6.0\n", + "Successfully installed JPype1-1.2.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2 tweepy-3.9.0\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "nTUUKfxg5MvF" + }, + "source": [ + "from konlpy.tag import Okt" + ], + "execution_count": 28, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BP51L-LQ5OlN", + "outputId": "375dc8c4-413d-4c84-f82c-63a506530d85" + }, + "source": [ + "okt = Okt()\r\n", + "print(train_data['document'][1])\r\n", + "okt.morphs(train_data['document'][1], stem = True)" + ], + "execution_count": 101, + "outputs": [ + { + "output_type": "stream", + "text": [ + "흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['흠', '포스터', '보고', '초딩', '영화', 
'줄', '오버', '연기', '조차', '가볍다', '않다']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 101 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FlmMJ8kD5P9h" + }, + "source": [ + "stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']\r\n", + "\r\n", + "X_train = []\r\n", + "for sentence in train_data['document']:\r\n", + " temp_X = []\r\n", + " temp_X = okt.morphs(sentence, stem=True) # 토큰화\r\n", + " temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거\r\n", + " X_train.append(temp_X)\r\n", + "\r\n", + "X_test = []\r\n", + "for sentence in test_data['document']:\r\n", + " temp_X = []\r\n", + " temp_X = okt.morphs(sentence, stem=True) # 토큰화\r\n", + " temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거\r\n", + " X_test.append(temp_X)" + ], + "execution_count": 36, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fh0UexgqBydS", + "outputId": "3463110d-81e1-4d93-b844-08cb3d01dd85" + }, + "source": [ + "print(X_train[:3])" + ], + "execution_count": 61, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[[51, 455, 17, 261, 660], [934, 458, 42, 603, 2, 215, 1450, 25, 962, 676, 20], [387, 2445, 1, 2316, 5672, 3, 223, 10]]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "waEO9jB7B9ql" + }, + "source": [ + "tokenizer = Tokenizer()\r\n", + "tokenizer.fit_on_texts(X_train)\r\n", + "print(tokenizer.word_index)\r\n", + "print(len(tokenizer.word_index))" + ], + "execution_count": 38, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Sgd-Hwq7B_B2", + "outputId": "244c8165-9a59-4649-dee8-fa95a6f6ddcc" + }, + "source": [ + "threshold = 3\r\n", + "total_cnt = len(tokenizer.word_index) # 단어의 수\r\n", + "rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트\r\n", + "\r\n", + "for key, 
value in tokenizer.word_counts.items():\r\n", + " if(value < threshold):\r\n", + " rare_cnt = rare_cnt + 1\r\n", + "\r\n", + "print('단어 집합(vocabulary)의 크기 :',total_cnt)\r\n", + "print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))" + ], + "execution_count": 39, + "outputs": [ + { + "output_type": "stream", + "text": [ + "단어 집합(vocabulary)의 크기 : 43752\n", + "등장 빈도가 2번 이하인 희귀 단어의 수: 24337\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wrQZFEUVCDRa", + "outputId": "0835f163-6876-4fa0-f7b8-caf27a386ba1" + }, + "source": [ + "vocab_size = total_cnt - rare_cnt + 2\r\n", + "print('단어 집합의 크기 :',vocab_size)" + ], + "execution_count": 40, + "outputs": [ + { + "output_type": "stream", + "text": [ + "단어 집합의 크기 : 19417\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "L2D8IT7xCFf9" + }, + "source": [ + "tokenizer = Tokenizer(vocab_size, oov_token = 'OOV') \r\n", + "tokenizer.fit_on_texts(X_train)\r\n", + "X_train = tokenizer.texts_to_sequences(X_train)\r\n", + "X_test = tokenizer.texts_to_sequences(X_test)" + ], + "execution_count": 41, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-v7C4X6tCILL" + }, + "source": [ + "max_len = 30\r\n", + "X_train = pad_sequences(X_train, maxlen = max_len)\r\n", + "X_test = pad_sequences(X_test, maxlen = max_len)" + ], + "execution_count": 63, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zojktSc1G-o8" + }, + "source": [ + "y_train = np.array(train_data['label'])\r\n", + "y_test = np.array(test_data['label'])\r\n", + "X_train[1]" + ], + "execution_count": 65, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "nZA9D39RGNkT" + }, + "source": [ + "from tensorflow.keras.layers import Embedding, Dense, LSTM\r\n", + "from tensorflow.keras.models import Sequential\r\n", + "from tensorflow.keras.models import load_model\r\n", 
+ "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint" + ], + "execution_count": 57, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "uVbgjA9EGWAw" + }, + "source": [ + "model = Sequential()\r\n", + "model.add(Embedding(vocab_size, 100))\r\n", + "model.add(LSTM(128))\r\n", + "model.add(Dense(1, activation='sigmoid'))" + ], + "execution_count": 58, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "kfN8Z-DGGXco" + }, + "source": [ + "es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)\r\n", + "mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)" + ], + "execution_count": 59, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iNFXL9s8GY1t", + "outputId": "36c4ce96-a1f5-4baa-ee10-88cc9722fede" + }, + "source": [ + "model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\r\n", + "history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=60, validation_split=0.2)" + ], + "execution_count": 69, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Epoch 1/15\n", + "1944/1944 [==============================] - 159s 81ms/step - loss: 0.3858 - acc: 0.8256 - val_loss: 0.3499 - val_acc: 0.8468\n", + "\n", + "Epoch 00001: val_acc improved from -inf to 0.84684, saving model to best_model.h5\n", + "Epoch 2/15\n", + "1944/1944 [==============================] - 156s 80ms/step - loss: 0.3294 - acc: 0.8567 - val_loss: 0.3359 - val_acc: 0.8548\n", + "\n", + "Epoch 00002: val_acc improved from 0.84684 to 0.85476, saving model to best_model.h5\n", + "Epoch 3/15\n", + "1944/1944 [==============================] - 158s 81ms/step - loss: 0.3003 - acc: 0.8716 - val_loss: 0.3314 - val_acc: 0.8572\n", + "\n", + "Epoch 00003: val_acc improved from 0.85476 to 0.85723, saving model to best_model.h5\n", + "Epoch 4/15\n", + 
"1944/1944 [==============================] - 155s 80ms/step - loss: 0.2806 - acc: 0.8821 - val_loss: 0.3319 - val_acc: 0.8554\n", + "\n", + "Epoch 00004: val_acc did not improve from 0.85723\n", + "Epoch 5/15\n", + "1944/1944 [==============================] - 153s 79ms/step - loss: 0.2611 - acc: 0.8928 - val_loss: 0.3367 - val_acc: 0.8603\n", + "\n", + "Epoch 00005: val_acc improved from 0.85723 to 0.86028, saving model to best_model.h5\n", + "Epoch 6/15\n", + "1944/1944 [==============================] - 153s 79ms/step - loss: 0.2492 - acc: 0.8998 - val_loss: 0.3365 - val_acc: 0.8559\n", + "\n", + "Epoch 00006: val_acc did not improve from 0.86028\n", + "Epoch 7/15\n", + "1944/1944 [==============================] - 153s 79ms/step - loss: 0.2340 - acc: 0.9070 - val_loss: 0.3479 - val_acc: 0.8555\n", + "\n", + "Epoch 00007: val_acc did not improve from 0.86028\n", + "Epoch 00007: early stopping\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LgDrITELGaCI", + "outputId": "22ea3bbb-0f9b-4faa-9001-d1c2334e714e" + }, + "source": [ + "loaded_model = load_model('best_model.h5')\r\n", + "print(loaded_model.predict(X_test[:5]))\r\n", + "print(y_test[:5])" + ], + "execution_count": 89, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[[0.98269826]\n", + " [0.10281318]\n", + " [0.00681123]\n", + " [0.08977833]\n", + " [0.9861894 ]]\n", + "[1 0 0 0 1]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zgh7HidIQLXX", + "outputId": "be94275b-85a4-45e6-fe96-3373f62a0f7c" + }, + "source": [ + "print(\"\\n 테스트 정확도: %.4f\" % (loaded_model.evaluate(X_test, y_test)[1]))" + ], + "execution_count": 90, + "outputs": [ + { + "output_type": "stream", + "text": [ + "1532/1532 [==============================] - 19s 12ms/step - loss: 0.3434 - acc: 0.8576\n", + "\n", + " 테스트 정확도: 
0.8576\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Ux5IGakqKA49" + }, + "source": [ + "def s_analysis(new_sentence):\r\n", + " new_sentence = okt.morphs(new_sentence, stem=True) # 토큰화\r\n", + " new_sentence = [word for word in new_sentence if not word in stopwords] # 불용어 제거\r\n", + " encoded = tokenizer.texts_to_sequences([new_sentence]) # 정수 인코딩\r\n", + " pad_new = pad_sequences(encoded, maxlen = max_len) # 패딩\r\n", + " score = float(loaded_model.predict(pad_new)) # 예측\r\n", + " if(score > 0.5):\r\n", + " print(\"{:.2f}% 확률로 긍정 리뷰입니다.\\n\".format(score * 100))\r\n", + " else:\r\n", + " print(\"{:.2f}% 확률로 부정 리뷰입니다.\\n\".format((1 - score) * 100))" + ], + "execution_count": 92, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aFOBFmYMQKrX", + "outputId": "b4ddf99b-b0b3-4131-b800-ec6fecc86a7b" + }, + "source": [ + "s_analysis(\"진짜 재미없고 최악이다\")" + ], + "execution_count": 103, + "outputs": [ + { + "output_type": "stream", + "text": [ + "99.73% 확률로 부정 리뷰입니다.\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aNfGM0fsQiTE", + "outputId": "8d8e62ac-a9fe-4850-d581-3f3d919fe526" + }, + "source": [ + "s_analysis(\"진짜 최고다. 
정말 박진감있고 흥미로웠다\")" + ], + "execution_count": 95, + "outputs": [ + { + "output_type": "stream", + "text": [ + "99.76% 확률로 긍정 리뷰입니다.\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ue9Yvi0MQr8E", + "outputId": "a24e5877-881f-4d54-905f-21cc84321938" + }, + "source": [ + "s_analysis(\"정말 기분이 좋네요\")" + ], + "execution_count": 99, + "outputs": [ + { + "output_type": "stream", + "text": [ + "96.36% 확률로 긍정 리뷰입니다.\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1z_X6QeuQ0gP", + "outputId": "f193e2e5-34e9-4698-b172-a1b68f7146a2" + }, + "source": [ + "s_analysis(\"진짜 재미없고 최악이다\")" + ], + "execution_count": 102, + "outputs": [ + { + "output_type": "stream", + "text": [ + "99.73% 확률로 부정 리뷰입니다.\n", + "\n" + ], + "name": "stdout" + } + ] + } + ] +} \ No newline at end of file