Upload code file

95438c6a · kimsumin · 537bf58f · 95438c6a
Commit 95438c6a authored Dec 20, 2020 by kimsumin
--- a/NLP.ipynb
+++ b/NLP.ipynb
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "NLP.ipynb",
+      "provenance": [],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "LoImN07UxAn2"
+      },
+      "source": [
+        "import pandas as pd\r\n",
+        "import numpy as np\r\n",
+        "%matplotlib inline\r\n",
+        "import matplotlib.pyplot as plt\r\n",
+        "import re\r\n",
+        "import urllib.request\r\n",
+        "from tensorflow.keras.preprocessing.text import Tokenizer\r\n",
+        "from tensorflow.keras.preprocessing.sequence import pad_sequences"
+      ],
+      "execution_count": 3,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Yf1ceqLwz_72"
+      },
+      "source": [
+        "urllib.request.urlretrieve(\"https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt\", filename=\"ratings_train.txt\")\r\n",
+        "urllib.request.urlretrieve(\"https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt\", filename=\"ratings_test.txt\")\r\n",
+        "\r\n",
+        "train_data = pd.read_table('ratings_train.txt')\r\n",
+        "test_data = pd.read_table('ratings_test.txt')"
+      ],
+      "execution_count": 4,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "PfZhNCB10AWy",
+        "outputId": "ec183025-6db0-470e-b85b-9f3b22c118e0"
+      },
+      "source": [
+        "print('훈련용 리뷰 개수 :',len(train_data)) # 훈련용 리뷰 개수 출력\r\n",
+        "print('테스트용 리뷰 개수 :',len(test_data)) # 테스트용 리뷰 개수 출력"
+      ],
+      "execution_count": 5,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "훈련용 리뷰 개수 : 150000\n",
+            "테스트용 리뷰 개수 : 50000\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "dFki1IvRY8ob"
+      },
+      "source": [
+        ""
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 221
+        },
+        "id": "lfEqvka20D7v",
+        "outputId": "04f01fb2-5b85-4443-ae8b-803c210daf65"
+      },
+      "source": [
+        "train_data[:5] # 상위 5개 출력"
+      ],
+      "execution_count": 20,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>id</th>\n",
+              "      <th>document</th>\n",
+              "      <th>label</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>9976970</td>\n",
+              "      <td>아 더빙.. 진짜 짜증나네요 목소리</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>3819312</td>\n",
+              "      <td>흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>10265843</td>\n",
+              "      <td>너무재밓었다그래서보는것을추천한다</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>9045019</td>\n",
+              "      <td>교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>6483659</td>\n",
+              "      <td>사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "         id                                           document  label\n",
+              "0   9976970                                아 더빙.. 진짜 짜증나네요 목소리      0\n",
+              "1   3819312                  흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나      1\n",
+              "2  10265843                                  너무재밓었다그래서보는것을추천한다      0\n",
+              "3   9045019                      교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정      0\n",
+              "4   6483659  사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...      1"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 20
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "0_QgSxri4rbN",
+        "outputId": "4bcdc6db-b9a1-4034-8919-ca25654f7c27"
+      },
+      "source": [
+        "train_data['document'].nunique(), train_data['label'].nunique()"
+      ],
+      "execution_count": 22,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "(146182, 2)"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 22
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "MPWimSwD4u5l"
+      },
+      "source": [
+        "train_data.drop_duplicates(subset=['document'], inplace=True)"
+      ],
+      "execution_count": 23,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "fOpiWx7N0RU0"
+      },
+      "source": [
+        "print(test_data.isnull().values.any())"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "YEbx4OUP0qNE"
+      },
+      "source": [
+        "train_data = train_data.dropna(how = 'any')"
+      ],
+      "execution_count": 100,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 221
+        },
+        "id": "vk4HP6lK0Hzx",
+        "outputId": "90be32a8-ecd3-4c6d-bc99-92d65431a7dc"
+      },
+      "source": [
+        "train_data['document'] = train_data['document'].str.replace(r\"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]\", \"\", regex=True)\r\n",
+        "# Remove everything except Hangul (jamo and syllables) and spaces.\r\n",
+        "# regex=True is spelled out because pandas >= 2.0 treats the pattern as a literal string by default.\r\n",
+        "train_data[:5]"
+      ],
+      "execution_count": 24,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>id</th>\n",
+              "      <th>document</th>\n",
+              "      <th>label</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>9976970</td>\n",
+              "      <td>아 더빙 진짜 짜증나네요 목소리</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>3819312</td>\n",
+              "      <td>흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>10265843</td>\n",
+              "      <td>너무재밓었다그래서보는것을추천한다</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>9045019</td>\n",
+              "      <td>교도소 이야기구먼 솔직히 재미는 없다평점 조정</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>6483659</td>\n",
+              "      <td>사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "         id                                           document  label\n",
+              "0   9976970                                  아 더빙 진짜 짜증나네요 목소리      0\n",
+              "1   3819312                         흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나      1\n",
+              "2  10265843                                  너무재밓었다그래서보는것을추천한다      0\n",
+              "3   9045019                          교도소 이야기구먼 솔직히 재미는 없다평점 조정      0\n",
+              "4   6483659  사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...      1"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 24
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "1QdaNPpr4-dL",
+        "outputId": "62e72381-4194-4738-f6c3-f1b79bead3b2"
+      },
+      "source": [
+        "# Reviews that are empty after cleaning become NaN and are dropped.\r\n",
+        "# The result is assigned back to the column: calling an inplace method on a column selection\r\n",
+        "# is chained assignment, which pandas warns about and which does not update the frame under Copy-on-Write.\r\n",
+        "train_data['document'] = train_data['document'].replace('', np.nan)\r\n",
+        "train_data = train_data.dropna(how = 'any')\r\n",
+        "print('전처리 후 훈련용 샘플의 개수 :',len(train_data))"
+      ],
+      "execution_count": 26,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "145791\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "FlcP0_l85DZY",
+        "outputId": "aeb52df5-b12c-4994-c200-023d2941a03e"
+      },
+      "source": [
+        "test_data.drop_duplicates(subset = ['document'], inplace=True) # drop rows whose document text is duplicated\r\n",
+        "# Keep only Hangul and spaces; regex=True is explicit because pandas >= 2.0 defaults to literal matching.\r\n",
+        "test_data['document'] = test_data['document'].str.replace(r\"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]\", \"\", regex=True)\r\n",
+        "# Turn empty strings into NaN by assigning back (an inplace call on the column selection is chained assignment).\r\n",
+        "test_data['document'] = test_data['document'].replace('', np.nan)\r\n",
+        "test_data = test_data.dropna(how='any') # drop rows containing NaN\r\n",
+        "print('전처리 후 테스트용 샘플의 개수 :',len(test_data))"
+      ],
+      "execution_count": 25,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "전처리 후 테스트용 샘플의 개수 : 48995\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "RYvpBwSM5Fq-",
+        "outputId": "795adda9-9d42-400d-f88c-a97b56fcd691"
+      },
+      "source": [
+        "!pip install konlpy"
+      ],
+      "execution_count": 27,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Collecting konlpy\n",
+            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)\n",
+            "\u001b[K     |████████████████████████████████| 19.4MB 1.3MB/s \n",
+            "\u001b[?25hCollecting beautifulsoup4==4.6.0\n",
+            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)\n",
+            "\u001b[K     |████████████████████████████████| 92kB 8.1MB/s \n",
+            "\u001b[?25hRequirement already satisfied: numpy>=1.6 in /usr/local/lib/python3.6/dist-packages (from konlpy) (1.19.4)\n",
+            "Collecting JPype1>=0.7.0\n",
+            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/b7/21/9e2c0dbf9df856e6392a1aec1d18006c60b175aa4e31d351e8278a8a63c0/JPype1-1.2.0-cp36-cp36m-manylinux2010_x86_64.whl (453kB)\n",
+            "\u001b[K     |████████████████████████████████| 460kB 42.1MB/s \n",
+            "\u001b[?25hRequirement already satisfied: lxml>=4.1.0 in /usr/local/lib/python3.6/dist-packages (from konlpy) (4.2.6)\n",
+            "Collecting colorama\n",
+            "  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl\n",
+            "Collecting tweepy>=3.7.0\n",
+            "  Downloading https://files.pythonhosted.org/packages/bb/7c/99d51f80f3b77b107ebae2634108717362c059a41384a1810d13e2429a81/tweepy-3.9.0-py2.py3-none-any.whl\n",
+            "Requirement already satisfied: typing-extensions; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from JPype1>=0.7.0->konlpy) (3.7.4.3)\n",
+            "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (1.15.0)\n",
+            "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (1.3.0)\n",
+            "Requirement already satisfied: requests[socks]>=2.11.1 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (2.23.0)\n",
+            "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->tweepy>=3.7.0->konlpy) (3.1.0)\n",
+            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (3.0.4)\n",
+            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.24.3)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2020.12.5)\n",
+            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2.10)\n",
+            "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6; extra == \"socks\" in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.7.1)\n",
+            "Installing collected packages: beautifulsoup4, JPype1, colorama, tweepy, konlpy\n",
+            "  Found existing installation: beautifulsoup4 4.6.3\n",
+            "    Uninstalling beautifulsoup4-4.6.3:\n",
+            "      Successfully uninstalled beautifulsoup4-4.6.3\n",
+            "  Found existing installation: tweepy 3.6.0\n",
+            "    Uninstalling tweepy-3.6.0:\n",
+            "      Successfully uninstalled tweepy-3.6.0\n",
+            "Successfully installed JPype1-1.2.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2 tweepy-3.9.0\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "nTUUKfxg5MvF"
+      },
+      "source": [
+        "from konlpy.tag import Okt"
+      ],
+      "execution_count": 28,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "BP51L-LQ5OlN",
+        "outputId": "375dc8c4-413d-4c84-f82c-63a506530d85"
+      },
+      "source": [
+        "okt = Okt()\r\n",
+        "print(train_data['document'][1])\r\n",
+        "okt.morphs(train_data['document'][1], stem = True)"
+      ],
+      "execution_count": 101,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나\n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍다', '않다']"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 101
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "FlmMJ8kD5P9h"
+      },
+      "source": [
+        "stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']\r\n",
+        "\r\n",
+        "# Tokenize each review into stemmed morphemes with Okt, then drop stopwords.\r\n",
+        "# Both splits use the same expression so their preprocessing cannot drift apart.\r\n",
+        "X_train = [\r\n",
+        "    [word for word in okt.morphs(sentence, stem=True) if word not in stopwords]\r\n",
+        "    for sentence in train_data['document']\r\n",
+        "]\r\n",
+        "\r\n",
+        "X_test = [\r\n",
+        "    [word for word in okt.morphs(sentence, stem=True) if word not in stopwords]\r\n",
+        "    for sentence in test_data['document']\r\n",
+        "]"
+      ],
+      "execution_count": 36,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "fh0UexgqBydS",
+        "outputId": "3463110d-81e1-4d93-b844-08cb3d01dd85"
+      },
+      "source": [
+        "print(X_train[:3])"
+      ],
+      "execution_count": 61,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "[[51, 455, 17, 261, 660], [934, 458, 42, 603, 2, 215, 1450, 25, 962, 676, 20], [387, 2445, 1, 2316, 5672, 3, 223, 10]]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "waEO9jB7B9ql"
+      },
+      "source": [
+        "tokenizer = Tokenizer()\r\n",
+        "tokenizer.fit_on_texts(X_train)\r\n",
+        "print(tokenizer.word_index)\r\n",
+        "print(len(tokenizer.word_index))"
+      ],
+      "execution_count": 38,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Sgd-Hwq7B_B2",
+        "outputId": "244c8165-9a59-4649-dee8-fa95a6f6ddcc"
+      },
+      "source": [
+        "threshold = 3\r\n",
+        "total_cnt = len(tokenizer.word_index) # 단어의 수\r\n",
+        "rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트\r\n",
+        "\r\n",
+        "for key, value in tokenizer.word_counts.items():\r\n",
+        "    if(value < threshold):\r\n",
+        "        rare_cnt = rare_cnt + 1\r\n",
+        "\r\n",
+        "print('단어 집합(vocabulary)의 크기 :',total_cnt)\r\n",
+        "print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))"
+      ],
+      "execution_count": 39,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "단어 집합(vocabulary)의 크기 : 43752\n",
+            "등장 빈도가 2번 이하인 희귀 단어의 수: 24337\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "wrQZFEUVCDRa",
+        "outputId": "0835f163-6876-4fa0-f7b8-caf27a386ba1"
+      },
+      "source": [
+        "vocab_size = total_cnt - rare_cnt + 2\r\n",
+        "print('단어 집합의 크기 :',vocab_size)"
+      ],
+      "execution_count": 40,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "단어 집합의 크기 : 19417\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "L2D8IT7xCFf9"
+      },
+      "source": [
+        "tokenizer = Tokenizer(vocab_size, oov_token = 'OOV') \r\n",
+        "tokenizer.fit_on_texts(X_train)\r\n",
+        "X_train = tokenizer.texts_to_sequences(X_train)\r\n",
+        "X_test = tokenizer.texts_to_sequences(X_test)"
+      ],
+      "execution_count": 41,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "-v7C4X6tCILL"
+      },
+      "source": [
+        "max_len = 30\r\n",
+        "X_train = pad_sequences(X_train, maxlen = max_len)\r\n",
+        "X_test = pad_sequences(X_test, maxlen = max_len)"
+      ],
+      "execution_count": 63,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "zojktSc1G-o8"
+      },
+      "source": [
+        "y_train = np.array(train_data['label'])\r\n",
+        "y_test = np.array(test_data['label'])\r\n",
+        "X_train[1]"
+      ],
+      "execution_count": 65,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "nZA9D39RGNkT"
+      },
+      "source": [
+        "from tensorflow.keras.layers import Embedding, Dense, LSTM\r\n",
+        "from tensorflow.keras.models import Sequential\r\n",
+        "from tensorflow.keras.models import load_model\r\n",
+        "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint"
+      ],
+      "execution_count": 57,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "uVbgjA9EGWAw"
+      },
+      "source": [
+        "model = Sequential()\r\n",
+        "model.add(Embedding(vocab_size, 100))\r\n",
+        "model.add(LSTM(128))\r\n",
+        "model.add(Dense(1, activation='sigmoid'))"
+      ],
+      "execution_count": 58,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "kfN8Z-DGGXco"
+      },
+      "source": [
+        "es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)\r\n",
+        "mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)"
+      ],
+      "execution_count": 59,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "iNFXL9s8GY1t",
+        "outputId": "36c4ce96-a1f5-4baa-ee10-88cc9722fede"
+      },
+      "source": [
+        "model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\r\n",
+        "history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=60, validation_split=0.2)"
+      ],
+      "execution_count": 69,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Epoch 1/15\n",
+            "1944/1944 [==============================] - 159s 81ms/step - loss: 0.3858 - acc: 0.8256 - val_loss: 0.3499 - val_acc: 0.8468\n",
+            "\n",
+            "Epoch 00001: val_acc improved from -inf to 0.84684, saving model to best_model.h5\n",
+            "Epoch 2/15\n",
+            "1944/1944 [==============================] - 156s 80ms/step - loss: 0.3294 - acc: 0.8567 - val_loss: 0.3359 - val_acc: 0.8548\n",
+            "\n",
+            "Epoch 00002: val_acc improved from 0.84684 to 0.85476, saving model to best_model.h5\n",
+            "Epoch 3/15\n",
+            "1944/1944 [==============================] - 158s 81ms/step - loss: 0.3003 - acc: 0.8716 - val_loss: 0.3314 - val_acc: 0.8572\n",
+            "\n",
+            "Epoch 00003: val_acc improved from 0.85476 to 0.85723, saving model to best_model.h5\n",
+            "Epoch 4/15\n",
+            "1944/1944 [==============================] - 155s 80ms/step - loss: 0.2806 - acc: 0.8821 - val_loss: 0.3319 - val_acc: 0.8554\n",
+            "\n",
+            "Epoch 00004: val_acc did not improve from 0.85723\n",
+            "Epoch 5/15\n",
+            "1944/1944 [==============================] - 153s 79ms/step - loss: 0.2611 - acc: 0.8928 - val_loss: 0.3367 - val_acc: 0.8603\n",
+            "\n",
+            "Epoch 00005: val_acc improved from 0.85723 to 0.86028, saving model to best_model.h5\n",
+            "Epoch 6/15\n",
+            "1944/1944 [==============================] - 153s 79ms/step - loss: 0.2492 - acc: 0.8998 - val_loss: 0.3365 - val_acc: 0.8559\n",
+            "\n",
+            "Epoch 00006: val_acc did not improve from 0.86028\n",
+            "Epoch 7/15\n",
+            "1944/1944 [==============================] - 153s 79ms/step - loss: 0.2340 - acc: 0.9070 - val_loss: 0.3479 - val_acc: 0.8555\n",
+            "\n",
+            "Epoch 00007: val_acc did not improve from 0.86028\n",
+            "Epoch 00007: early stopping\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "LgDrITELGaCI",
+        "outputId": "22ea3bbb-0f9b-4faa-9001-d1c2334e714e"
+      },
+      "source": [
+        "loaded_model = load_model('best_model.h5')\r\n",
+        "print(loaded_model.predict(X_test[:5]))\r\n",
+        "print(y_test[:5])"
+      ],
+      "execution_count": 89,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "[[0.98269826]\n",
+            " [0.10281318]\n",
+            " [0.00681123]\n",
+            " [0.08977833]\n",
+            " [0.9861894 ]]\n",
+            "[1 0 0 0 1]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "zgh7HidIQLXX",
+        "outputId": "be94275b-85a4-45e6-fe96-3373f62a0f7c"
+      },
+      "source": [
+        "print(\"\\n 테스트 정확도: %.4f\" % (loaded_model.evaluate(X_test, y_test)[1]))"
+      ],
+      "execution_count": 90,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "1532/1532 [==============================] - 19s 12ms/step - loss: 0.3434 - acc: 0.8576\n",
+            "\n",
+            " 테스트 정확도: 0.8576\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Ux5IGakqKA49"
+      },
+      "source": [
+        "def s_analysis(new_sentence):\r\n",
+        "  \"\"\"Print the predicted sentiment of one review together with its confidence.\r\n",
+        "\r\n",
+        "  The sentence goes through the same steps as the training data: Hangul-only cleaning,\r\n",
+        "  Okt tokenization with stemming, stopword removal, integer encoding and padding.\r\n",
+        "  A model score above 0.5 is reported as positive, otherwise as negative.\r\n",
+        "  Relies on the notebook globals okt, stopwords, tokenizer, max_len and loaded_model.\r\n",
+        "  \"\"\"\r\n",
+        "  # Same filter as applied to train/test documents, so punctuation, digits and Latin letters\r\n",
+        "  # do not reach the tokenizer as tokens the model never saw during training.\r\n",
+        "  new_sentence = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', new_sentence)\r\n",
+        "  new_sentence = okt.morphs(new_sentence, stem=True) # tokenize\r\n",
+        "  new_sentence = [word for word in new_sentence if word not in stopwords] # drop stopwords\r\n",
+        "  encoded = tokenizer.texts_to_sequences([new_sentence]) # integer encoding\r\n",
+        "  pad_new = pad_sequences(encoded, maxlen = max_len) # padding\r\n",
+        "  # predict returns one row with one sigmoid output; index it explicitly because\r\n",
+        "  # float() on an array with ndim > 0 is deprecated in recent NumPy.\r\n",
+        "  score = float(loaded_model.predict(pad_new)[0][0])\r\n",
+        "  if(score > 0.5):\r\n",
+        "    print(\"{:.2f}% 확률로 긍정 리뷰입니다.\\n\".format(score * 100))\r\n",
+        "  else:\r\n",
+        "    print(\"{:.2f}% 확률로 부정 리뷰입니다.\\n\".format((1 - score) * 100))"
+      ],
+      "execution_count": 92,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "aFOBFmYMQKrX",
+        "outputId": "b4ddf99b-b0b3-4131-b800-ec6fecc86a7b"
+      },
+      "source": [
+        "s_analysis(\"진짜 재미없고 최악이다\")"
+      ],
+      "execution_count": 103,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "99.73% 확률로 부정 리뷰입니다.\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "aNfGM0fsQiTE",
+        "outputId": "8d8e62ac-a9fe-4850-d581-3f3d919fe526"
+      },
+      "source": [
+        "s_analysis(\"진짜 최고다. 정말 박진감있고 흥미로웠다\")"
+      ],
+      "execution_count": 95,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "99.76% 확률로 긍정 리뷰입니다.\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ue9Yvi0MQr8E",
+        "outputId": "a24e5877-881f-4d54-905f-21cc84321938"
+      },
+      "source": [
+        "s_analysis(\"정말 기분이 좋네요\")"
+      ],
+      "execution_count": 99,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "96.36% 확률로 긍정 리뷰입니다.\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "1z_X6QeuQ0gP",
+        "outputId": "f193e2e5-34e9-4698-b172-a1b68f7146a2"
+      },
+      "source": [
+        "s_analysis(\"진짜 재미없고 최악이다\")"
+      ],
+      "execution_count": 102,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "99.73% 확률로 부정 리뷰입니다.\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
+%% Cell type:code id: tags:
+
+``` 
+import pandas as pd
+import numpy as np
+%matplotlib inline
+import matplotlib.pyplot as plt
+import re
+import urllib.request
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+```
+
+%% Cell type:code id: tags:
+
+``` 
+urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
+urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")
+
+train_data = pd.read_table('ratings_train.txt')
+test_data = pd.read_table('ratings_test.txt')
+```
+
+%% Cell type:code id: tags:
+
+``` 
+print('훈련용 리뷰 개수 :',len(train_data)) # 훈련용 리뷰 개수 출력
+print('테스트용 리뷰 개수 :',len(test_data)) # 테스트용 리뷰 개수 출력
+```
+
+%% Output
+
+    훈련용 리뷰 개수 : 150000
+    테스트용 리뷰 개수 : 50000
+
+%% Cell type:code id: tags:
+
+``` 
+
+```
+
+%% Cell type:code id: tags:
+
+``` 
+train_data[:5] # 상위 5개 출력
+```
+
+%% Output
+
+             id                                           document  label
+    0   9976970                                아 더빙.. 진짜 짜증나네요 목소리      0
+    1   3819312                  흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나      1
+    2  10265843                                  너무재밓었다그래서보는것을추천한다      0
+    3   9045019                      교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정      0
+    4   6483659  사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...      1
+
+%% Cell type:code id: tags:
+
+``` 
+train_data['document'].nunique(), train_data['label'].nunique()
+```
+
+%% Output
+
+    (146182, 2)
+
+%% Cell type:code id: tags:
+
+``` 
+train_data.drop_duplicates(subset=['document'], inplace=True)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+print(test_data.isnull().values.any())
+```
+
+%% Cell type:code id: tags:
+
+``` 
+train_data = train_data.dropna(how = 'any')
+```
+
+%% Cell type:code id: tags:
+
+``` 
+train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
+# 한글과 공백을 제외하고 모두 제거
+train_data[:5]
+```
+
+%% Output
+
+             id                                           document  label
+    0   9976970                                  아 더빙 진짜 짜증나네요 목소리      0
+    1   3819312                         흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나      1
+    2  10265843                                  너무재밓었다그래서보는것을추천한다      0
+    3   9045019                          교도소 이야기구먼 솔직히 재미는 없다평점 조정      0
+    4   6483659  사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...      1
+
+%% Cell type:code id: tags:
+
+``` 
+train_data['document'].replace('', np.nan, inplace=True)
+train_data = train_data.dropna(how = 'any')
+print('전처리 후 훈련용 샘플의 개수 :',len(train_data))
+```
+
+%% Output
+
+    145791
+
+%% Cell type:code id: tags:
+
+``` 
+test_data.drop_duplicates(subset = ['document'], inplace=True) # document 열에서 중복인 내용이 있다면 중복 제거
+test_data['document'] = test_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # 정규 표현식 수행
+test_data['document'].replace('', np.nan, inplace=True) # 공백은 Null 값으로 변경
+test_data = test_data.dropna(how='any') # Null 값 제거
+print('전처리 후 테스트용 샘플의 개수 :',len(test_data))
+```
+
+%% Output
+
+    전처리 후 테스트용 샘플의 개수 : 48995
+
+%% Cell type:code id: tags:
+
+``` 
+!pip install konlpy
+```
+
+%% Output
+
+    Collecting konlpy
+    [?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
+    [K     |████████████████████████████████| 19.4MB 1.3MB/s
+    [?25hCollecting beautifulsoup4==4.6.0
+    [?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
+    [K     |████████████████████████████████| 92kB 8.1MB/s
+    [?25hRequirement already satisfied: numpy>=1.6 in /usr/local/lib/python3.6/dist-packages (from konlpy) (1.19.4)
+    Collecting JPype1>=0.7.0
+    [?25l  Downloading https://files.pythonhosted.org/packages/b7/21/9e2c0dbf9df856e6392a1aec1d18006c60b175aa4e31d351e8278a8a63c0/JPype1-1.2.0-cp36-cp36m-manylinux2010_x86_64.whl (453kB)
+    [K     |████████████████████████████████| 460kB 42.1MB/s
+    [?25hRequirement already satisfied: lxml>=4.1.0 in /usr/local/lib/python3.6/dist-packages (from konlpy) (4.2.6)
+    Collecting colorama
+      Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
+    Collecting tweepy>=3.7.0
+      Downloading https://files.pythonhosted.org/packages/bb/7c/99d51f80f3b77b107ebae2634108717362c059a41384a1810d13e2429a81/tweepy-3.9.0-py2.py3-none-any.whl
+    Requirement already satisfied: typing-extensions; python_version < "3.8" in /usr/local/lib/python3.6/dist-packages (from JPype1>=0.7.0->konlpy) (3.7.4.3)
+    Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (1.15.0)
+    Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (1.3.0)
+    Requirement already satisfied: requests[socks]>=2.11.1 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (2.23.0)
+    Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->tweepy>=3.7.0->konlpy) (3.1.0)
+    Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (3.0.4)
+    Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.24.3)
+    Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2020.12.5)
+    Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2.10)
+    Requirement already satisfied: PySocks!=1.5.7,>=1.5.6; extra == "socks" in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.7.1)
+    Installing collected packages: beautifulsoup4, JPype1, colorama, tweepy, konlpy
+      Found existing installation: beautifulsoup4 4.6.3
+        Uninstalling beautifulsoup4-4.6.3:
+          Successfully uninstalled beautifulsoup4-4.6.3
+      Found existing installation: tweepy 3.6.0
+        Uninstalling tweepy-3.6.0:
+          Successfully uninstalled tweepy-3.6.0
+    Successfully installed JPype1-1.2.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2 tweepy-3.9.0
+
+%% Cell type:code id: tags:
+
+``` 
+from konlpy.tag import Okt
+```
+
+%% Cell type:code id: tags:
+
+``` 
+okt = Okt()
+print(train_data['document'][1])
+okt.morphs(train_data['document'][1], stem = True)
+```
+
+%% Output
+
+    흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나
+
+    ['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍다', '않다']
+
+%% Cell type:code id: tags:
+
+``` 
+stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
+
+X_train = []
+for sentence in train_data['document']:
+    temp_X = []
+    temp_X = okt.morphs(sentence, stem=True) # 토큰화
+    temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거a
+    X_train.append(temp_X)
+
+X_test = []
+for sentence in test_data['document']:
+    temp_X = []
+    temp_X = okt.morphs(sentence, stem=True) # 토큰화
+    temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
+    X_test.append(temp_X)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+print(X_train[:3])
+```
+
+%% Output
+
+    [[51, 455, 17, 261, 660], [934, 458, 42, 603, 2, 215, 1450, 25, 962, 676, 20], [387, 2445, 1, 2316, 5672, 3, 223, 10]]
+
+%% Cell type:code id: tags:
+
+``` 
+tokenizer = Tokenizer()
+tokenizer.fit_on_texts(X_train)
+print(tokenizer.word_index)
+print(len(tokenizer.word_index))
+```
+
+%% Cell type:code id: tags:
+
+``` 
+threshold = 3
+total_cnt = len(tokenizer.word_index) # 단어의 수
+rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
+
+for key, value in tokenizer.word_counts.items():
+    if(value < threshold):
+        rare_cnt = rare_cnt + 1
+
+print('단어 집합(vocabulary)의 크기 :',total_cnt)
+print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
+```
+
+%% Output
+
+    단어 집합(vocabulary)의 크기 : 43752
+    등장 빈도가 2번 이하인 희귀 단어의 수: 24337
+
+%% Cell type:code id: tags:
+
+``` 
+vocab_size = total_cnt - rare_cnt + 2
+print('단어 집합의 크기 :',vocab_size)
+```
+
+%% Output
+
+    단어 집합의 크기 : 19417
+
+%% Cell type:code id: tags:
+
+``` 
+# Re-create the tokenizer with vocab_size as its first argument and 'OOV' as the
+# out-of-vocabulary token, fit it on the tokenized training reviews, then
+# integer-encode both splits with it. X_train / X_test are rebound to the encoded
+# sequences, so this cell cannot be re-run without re-running the tokenization cell.
+tokenizer = Tokenizer(vocab_size, oov_token = 'OOV')
+tokenizer.fit_on_texts(X_train)
+X_train = tokenizer.texts_to_sequences(X_train)
+X_test = tokenizer.texts_to_sequences(X_test)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+max_len = 30
+X_train = pad_sequences(X_train, maxlen = max_len)
+X_test = pad_sequences(X_test, maxlen = max_len)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+y_train = np.array(train_data['label'])
+y_test = np.array(test_data['label'])
+X_train[1]
+```
+
+%% Cell type:code id: tags:
+
+``` 
+from tensorflow.keras.layers import Embedding, Dense, LSTM
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.models import load_model
+from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
+```
+
+%% Cell type:code id: tags:
+
+``` 
+model = Sequential()
+model.add(Embedding(vocab_size, 100))
+model.add(LSTM(128))
+model.add(Dense(1, activation='sigmoid'))
+```
+
+%% Cell type:code id: tags:
+
+``` 
+es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
+mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
+history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=60, validation_split=0.2)
+```
+
+%% Output
+
+    Epoch 1/15
+    1944/1944 [==============================] - 159s 81ms/step - loss: 0.3858 - acc: 0.8256 - val_loss: 0.3499 - val_acc: 0.8468
+    
+    Epoch 00001: val_acc improved from -inf to 0.84684, saving model to best_model.h5
+    Epoch 2/15
+    1944/1944 [==============================] - 156s 80ms/step - loss: 0.3294 - acc: 0.8567 - val_loss: 0.3359 - val_acc: 0.8548
+    
+    Epoch 00002: val_acc improved from 0.84684 to 0.85476, saving model to best_model.h5
+    Epoch 3/15
+    1944/1944 [==============================] - 158s 81ms/step - loss: 0.3003 - acc: 0.8716 - val_loss: 0.3314 - val_acc: 0.8572
+    
+    Epoch 00003: val_acc improved from 0.85476 to 0.85723, saving model to best_model.h5
+    Epoch 4/15
+    1944/1944 [==============================] - 155s 80ms/step - loss: 0.2806 - acc: 0.8821 - val_loss: 0.3319 - val_acc: 0.8554
+    
+    Epoch 00004: val_acc did not improve from 0.85723
+    Epoch 5/15
+    1944/1944 [==============================] - 153s 79ms/step - loss: 0.2611 - acc: 0.8928 - val_loss: 0.3367 - val_acc: 0.8603
+    
+    Epoch 00005: val_acc improved from 0.85723 to 0.86028, saving model to best_model.h5
+    Epoch 6/15
+    1944/1944 [==============================] - 153s 79ms/step - loss: 0.2492 - acc: 0.8998 - val_loss: 0.3365 - val_acc: 0.8559
+    
+    Epoch 00006: val_acc did not improve from 0.86028
+    Epoch 7/15
+    1944/1944 [==============================] - 153s 79ms/step - loss: 0.2340 - acc: 0.9070 - val_loss: 0.3479 - val_acc: 0.8555
+    
+    Epoch 00007: val_acc did not improve from 0.86028
+    Epoch 00007: early stopping
+
+%% Cell type:code id: tags:
+
+``` 
+loaded_model = load_model('best_model.h5')
+print(loaded_model.predict(X_test[:5]))
+print(y_test[:5])
+```
+
+%% Output
+
+    [[0.98269826]
+     [0.10281318]
+     [0.00681123]
+     [0.08977833]
+     [0.9861894 ]]
+    [1 0 0 0 1]
+
+%% Cell type:code id: tags:
+
+``` 
+print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))
+```
+
+%% Output
+
+    1532/1532 [==============================] - 19s 12ms/step - loss: 0.3434 - acc: 0.8576
+    
+     테스트 정확도: 0.8576
+
+%% Cell type:code id: tags:
+
+``` 
+def s_analysis(new_sentence):
+  new_sentence = okt.morphs(new_sentence, stem=True) # 토큰화
+  new_sentence = [word for word in new_sentence if not word in stopwords] # 불용어 제거
+  encoded = tokenizer.texts_to_sequences([new_sentence]) # 정수 인코딩
+  pad_new = pad_sequences(encoded, maxlen = max_len) # 패딩
+  score = float(loaded_model.predict(pad_new)) # 예측
+  if(score > 0.5):
+    print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(score * 100))
+  else:
+    print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1 - score) * 100))
+```
+
+%% Cell type:code id: tags:
+
+``` 
+s_analysis("진짜 재미없고 최악이다")
+```
+
+%% Output
+
+    99.73% 확률로 부정 리뷰입니다.
+    
+
+%% Cell type:code id: tags:
+
+``` 
+s_analysis("진짜 최고다. 정말 박진감있고 흥미로웠다")
+```
+
+%% Output
+
+    99.76% 확률로 긍정 리뷰입니다.
+    
+
+%% Cell type:code id: tags:
+
+``` 
+s_analysis("정말 기분이 좋네요")
+```
+
+%% Output
+
+    96.36% 확률로 긍정 리뷰입니다.
+    
+
+%% Cell type:code id: tags:
+
+``` 
+s_analysis("진짜 재미없고 최악이다")
+```
+
+%% Output
+
+    99.73% 확률로 부정 리뷰입니다.
+