using DM_data.csv

98b3a7e0 · 지수 · 7d44809d · 98b3a7e0
Commit 98b3a7e0 authored Nov 26, 2021 by 지수
--- a/[DM]_Naive_Bayes.ipynb
+++ b/[DM]_Naive_Bayes.ipynb
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "[DM] Naive_Bayes.ipynb",
+      "provenance": [],
+      "collapsed_sections": [],
+      "authorship_tag": "ABX9TyNQ0EbVS6CTLuq4m8RIc+fn",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/lani009/IDS-DataMining/blob/main/%5BDM%5D_Naive_Bayes.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "p_S1iryH1NBB"
+      },
+      "source": [
+        "import os\n",
+        "import pandas as pd\n",
+        "import numpy as np\n",
+        "import matplotlib.pyplot as plt\n",
+        "import seaborn as sns\n",
+        "import time"
+      ],
+      "execution_count": 1,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "S3PZhNLC1daY",
+        "outputId": "0a4f496e-8485-441d-eff6-d7bc67c3886e"
+      },
+      "source": [
+        "data = pd.read_csv('DM_data.csv')\n",
+        "data.info()"
+      ],
+      "execution_count": 2,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "<class 'pandas.core.frame.DataFrame'>\n",
+            "RangeIndex: 25192 entries, 0 to 25191\n",
+            "Data columns (total 40 columns):\n",
+            " #   Column                       Non-Null Count  Dtype  \n",
+            "---  ------                       --------------  -----  \n",
+            " 0   duration                     25192 non-null  int64  \n",
+            " 1   protocol_type                25192 non-null  int64  \n",
+            " 2   service                      25192 non-null  int64  \n",
+            " 3   flag                         25192 non-null  int64  \n",
+            " 4   src_bytes                    25192 non-null  int64  \n",
+            " 5   dst_bytes                    25192 non-null  int64  \n",
+            " 6   land                         25192 non-null  int64  \n",
+            " 7   wrong_fragment               25192 non-null  int64  \n",
+            " 8   hot                          25192 non-null  int64  \n",
+            " 9   num_failed_logins            25192 non-null  int64  \n",
+            " 10  logged_in                    25192 non-null  int64  \n",
+            " 11  num_compromised              25192 non-null  int64  \n",
+            " 12  root_shell                   25192 non-null  int64  \n",
+            " 13  su_attempted                 25192 non-null  int64  \n",
+            " 14  num_root                     25192 non-null  int64  \n",
+            " 15  num_file_creations           25192 non-null  int64  \n",
+            " 16  num_shells                   25192 non-null  int64  \n",
+            " 17  num_access_files             25192 non-null  int64  \n",
+            " 18  is_guest_login               25192 non-null  int64  \n",
+            " 19  count                        25192 non-null  int64  \n",
+            " 20  srv_count                    25192 non-null  int64  \n",
+            " 21  serror_rate                  25192 non-null  float64\n",
+            " 22  srv_serror_rate              25192 non-null  float64\n",
+            " 23  rerror_rate                  25192 non-null  float64\n",
+            " 24  srv_rerror_rate              25192 non-null  float64\n",
+            " 25  same_srv_rate                25192 non-null  float64\n",
+            " 26  diff_srv_rate                25192 non-null  float64\n",
+            " 27  srv_diff_host_rate           25192 non-null  float64\n",
+            " 28  dst_host_count               25192 non-null  int64  \n",
+            " 29  dst_host_srv_count           25192 non-null  int64  \n",
+            " 30  dst_host_same_srv_rate       25192 non-null  float64\n",
+            " 31  dst_host_diff_srv_rate       25192 non-null  float64\n",
+            " 32  dst_host_same_src_port_rate  25192 non-null  float64\n",
+            " 33  dst_host_srv_diff_host_rate  25192 non-null  float64\n",
+            " 34  dst_host_serror_rate         25192 non-null  float64\n",
+            " 35  dst_host_srv_serror_rate     25192 non-null  float64\n",
+            " 36  dst_host_rerror_rate         25192 non-null  float64\n",
+            " 37  dst_host_srv_rerror_rate     25192 non-null  float64\n",
+            " 38  class                        25192 non-null  int64  \n",
+            " 39  index_num                    25192 non-null  int64  \n",
+            "dtypes: float64(15), int64(25)\n",
+            "memory usage: 7.7 MB\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "vzo6lf_G3QFN"
+      },
+      "source": [
+        "from sklearn.model_selection import train_test_split\n",
+        "from sklearn.preprocessing import MinMaxScaler"
+      ],
+      "execution_count": 3,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ioY_BhsQ3Suc"
+      },
+      "source": [
+        "data_y = data[\"class\"]\n",
+        "data_X = data.drop(columns = [\"class\",\"index_num\"])"
+      ],
+      "execution_count": 4,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Aoz6AkZa3_rU"
+      },
+      "source": [
+        "sc = MinMaxScaler()\n",
+        "_X = sc.fit_transform(data_X)"
+      ],
+      "execution_count": 5,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "LffojJ-C1tEY",
+        "outputId": "95782163-850d-4477-fae8-726c620762c9"
+      },
+      "source": [
+        "X_train, X_test, Y_train, Y_test = train_test_split(_X, data_y, test_size=0.33, random_state=42)\n",
+        "print(X_train.shape, X_test.shape)\n",
+        "print(Y_train.shape, Y_test.shape)"
+      ],
+      "execution_count": 6,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "(16878, 38) (8314, 38)\n",
+            "(16878,) (8314,)\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "R1X0zCyN4qNT"
+      },
+      "source": [
+        "## **Naive Bayes**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "xupFriQx2n6T"
+      },
+      "source": [
+        "from sklearn.naive_bayes import GaussianNB"
+      ],
+      "execution_count": 7,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "bTXaZ-jf4Slk"
+      },
+      "source": [
+        "nb = GaussianNB()"
+      ],
+      "execution_count": 8,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "N8lbZQaE4UD8",
+        "outputId": "e45bf357-97fe-4390-c610-60066780e685"
+      },
+      "source": [
+        "start_time = time.time()\n",
+        "nb.fit(X_train, Y_train.values.ravel())\n",
+        "end_time = time.time()\n",
+        "print(\"Training time: \",end_time-start_time)"
+      ],
+      "execution_count": 9,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Training time:  0.012809514999389648\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Kf7YGTRd4WyR",
+        "outputId": "668668e8-3349-4ba4-a69a-40a6d3aa2a19"
+      },
+      "source": [
+        "start_time = time.time()\n",
+        "Y_test_pred = nb.predict(X_test)\n",
+        "end_time = time.time()\n",
+        "print(\"Testing time: \",end_time-start_time)"
+      ],
+      "execution_count": 10,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Testing time:  0.012314796447753906\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Rr1BkXsS4bce",
+        "outputId": "96ee68e9-f9e2-410e-970b-e842eb347146"
+      },
+      "source": [
+        "print(\"Train score is:\", nb.score(X_train, Y_train))\n",
+        "print(\"Test score is:\",nb.score(X_test,Y_test))"
+      ],
+      "execution_count": 11,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Train score is: 0.8958407394241024\n",
+            "Test score is: 0.9030550878037046\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "IMUmcZRA4mp1"
+      },
+      "source": [
+        "Gaussian Naive Bayes accuracy on the held-out test set: 90.31% (training set: 89.58%)"
+      ]
+    }
+  ]
+}
\ No newline at end of file
+%% Cell type:markdown id: tags:
+<a href="https://colab.research.google.com/github/lani009/IDS-DataMining/blob/main/%5BDM%5D_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
+%% Cell type:code id: tags:
+``` 
+import os
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import time
+```
+%% Cell type:code id: tags:
+``` 
+data = pd.read_csv('DM_data.csv')
+data.info()
+```
+%% Output
+    <class 'pandas.core.frame.DataFrame'>
+    RangeIndex: 25192 entries, 0 to 25191
+    Data columns (total 40 columns):
+     #   Column                       Non-Null Count  Dtype
+    ---  ------                       --------------  -----
+     0   duration                     25192 non-null  int64
+     1   protocol_type                25192 non-null  int64
+     2   service                      25192 non-null  int64
+     3   flag                         25192 non-null  int64
+     4   src_bytes                    25192 non-null  int64
+     5   dst_bytes                    25192 non-null  int64
+     6   land                         25192 non-null  int64
+     7   wrong_fragment               25192 non-null  int64
+     8   hot                          25192 non-null  int64
+     9   num_failed_logins            25192 non-null  int64
+     10  logged_in                    25192 non-null  int64
+     11  num_compromised              25192 non-null  int64
+     12  root_shell                   25192 non-null  int64
+     13  su_attempted                 25192 non-null  int64
+     14  num_root                     25192 non-null  int64
+     15  num_file_creations           25192 non-null  int64
+     16  num_shells                   25192 non-null  int64
+     17  num_access_files             25192 non-null  int64
+     18  is_guest_login               25192 non-null  int64
+     19  count                        25192 non-null  int64
+     20  srv_count                    25192 non-null  int64
+     21  serror_rate                  25192 non-null  float64
+     22  srv_serror_rate              25192 non-null  float64
+     23  rerror_rate                  25192 non-null  float64
+     24  srv_rerror_rate              25192 non-null  float64
+     25  same_srv_rate                25192 non-null  float64
+     26  diff_srv_rate                25192 non-null  float64
+     27  srv_diff_host_rate           25192 non-null  float64
+     28  dst_host_count               25192 non-null  int64
+     29  dst_host_srv_count           25192 non-null  int64
+     30  dst_host_same_srv_rate       25192 non-null  float64
+     31  dst_host_diff_srv_rate       25192 non-null  float64
+     32  dst_host_same_src_port_rate  25192 non-null  float64
+     33  dst_host_srv_diff_host_rate  25192 non-null  float64
+     34  dst_host_serror_rate         25192 non-null  float64
+     35  dst_host_srv_serror_rate     25192 non-null  float64
+     36  dst_host_rerror_rate         25192 non-null  float64
+     37  dst_host_srv_rerror_rate     25192 non-null  float64
+     38  class                        25192 non-null  int64
+     39  index_num                    25192 non-null  int64
+    dtypes: float64(15), int64(25)
+    memory usage: 7.7 MB
+%% Cell type:code id: tags:
+``` 
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+```
+%% Cell type:code id: tags:
+``` 
+data_y = data["class"]
+data_X = data.drop(columns = ["class","index_num"])
+```
+%% Cell type:code id: tags:
+``` 
+sc = MinMaxScaler()
+_X = sc.fit_transform(data_X)
+```
+%% Cell type:code id: tags:
+``` 
+X_train, X_test, Y_train, Y_test = train_test_split(_X, data_y, test_size=0.33, random_state=42)
+print(X_train.shape, X_test.shape)
+print(Y_train.shape, Y_test.shape)
+```
+%% Output
+    (16878, 38) (8314, 38)
+    (16878,) (8314,)
+%% Cell type:markdown id: tags:
+## **Naive Bayes**
+%% Cell type:code id: tags:
+``` 
+from sklearn.naive_bayes import GaussianNB
+```
+%% Cell type:code id: tags:
+``` 
+# Gaussian Naive Bayes classifier constructed with no arguments, i.e. the library defaults.
+nb = GaussianNB()
+```
+%% Cell type:code id: tags:
+``` 
+start_time = time.time()
+nb.fit(X_train, Y_train.values.ravel())
+end_time = time.time()
+print("Training time: ",end_time-start_time)
+```
+%% Output
+    Training time:  0.012809514999389648
+%% Cell type:code id: tags:
+``` 
+start_time = time.time()
+Y_test_pred = nb.predict(X_test)
+end_time = time.time()
+print("Testing time: ",end_time-start_time)
+```
+%% Output
+    Testing time:  0.012314796447753906
+%% Cell type:code id: tags:
+``` 
+print("Train score is:", nb.score(X_train, Y_train))
+print("Test score is:",nb.score(X_test,Y_test))
+```
+%% Output
+    Train score is: 0.8958407394241024
+    Test score is: 0.9030550878037046
+%% Cell type:markdown id: tags:
+Gaussian Naive Bayes accuracy on the held-out test set: 90.31% (training set: 89.58%)