Add Accuracy etc..

36ede52e · 지수 · 3edf9fc1 · 36ede52e
Commit 36ede52e authored 3 years ago by 지수
--- a/[DM]Apriori.ipynb
+++ b/[DM]Apriori.ipynb
@@ -6,7 +6,7 @@
      "name": "[DM]Apriori.ipynb",
      "provenance": [],
      "collapsed_sections": [],
-      "authorship_tag": "ABX9TyMF242OqNw0l0NQIEHfC2Vp",
+      "authorship_tag": "ABX9TyPQv9I66rslo5RN/uXRNX/R",
      "include_colab_link": true
    },
    "kernelspec": {
@@ -2820,8 +2820,6 @@
        "               |   Prediction of Attack     |  Prediction of Non-Attack\n",
        "---\n",
        "    Attack     |    True Positive : 2020    |    False Negative : 1396\n",
-        "\n",
-        "\n",
        "---\n",
        "    Non-Attack |    False Positive : 1      |    True Negative : 4041\n",
        "\n",

 %% Cell type:markdown id: tags:
 <a href="https://colab.research.google.com/github/lani009/IDS-DataMining/blob/main/%5BDM%5DApriori.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
 %% Cell type:code id: tags:
 ``` 
 import os
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 import time
 ```
 %% Cell type:code id: tags:
 ``` 
 # Load the dataset from DM_data.csv and print its per-column summary
 # (non-null counts and dtypes).
 data = pd.read_csv('DM_data.csv')
 data.info()
 ```
 %% Output
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 25192 entries, 0 to 25191
    Data columns (total 40 columns):
     #   Column                       Non-Null Count  Dtype
    ---  ------                       --------------  -----
     0   duration                     25192 non-null  int64
     1   protocol_type                25192 non-null  int64
     2   service                      25192 non-null  int64
     3   flag                         25192 non-null  int64
     4   src_bytes                    25192 non-null  int64
     5   dst_bytes                    25192 non-null  int64
     6   land                         25192 non-null  int64
     7   wrong_fragment               25192 non-null  int64
     8   hot                          25192 non-null  int64
     9   num_failed_logins            25192 non-null  int64
     10  logged_in                    25192 non-null  int64
     11  num_compromised              25192 non-null  int64
     12  root_shell                   25192 non-null  int64
     13  su_attempted                 25192 non-null  int64
     14  num_root                     25192 non-null  int64
     15  num_file_creations           25192 non-null  int64
     16  num_shells                   25192 non-null  int64
     17  num_access_files             25192 non-null  int64
     18  is_guest_login               25192 non-null  int64
     19  count                        25192 non-null  int64
     20  srv_count                    25192 non-null  int64
     21  serror_rate                  25192 non-null  float64
     22  srv_serror_rate              25192 non-null  float64
     23  rerror_rate                  25192 non-null  float64
     24  srv_rerror_rate              25192 non-null  float64
     25  same_srv_rate                25192 non-null  float64
     26  diff_srv_rate                25192 non-null  float64
     27  srv_diff_host_rate           25192 non-null  float64
     28  dst_host_count               25192 non-null  int64
     29  dst_host_srv_count           25192 non-null  int64
     30  dst_host_same_srv_rate       25192 non-null  float64
     31  dst_host_diff_srv_rate       25192 non-null  float64
     32  dst_host_same_src_port_rate  25192 non-null  float64
     33  dst_host_srv_diff_host_rate  25192 non-null  float64
     34  dst_host_serror_rate         25192 non-null  float64
     35  dst_host_srv_serror_rate     25192 non-null  float64
     36  dst_host_rerror_rate         25192 non-null  float64
     37  dst_host_srv_rerror_rate     25192 non-null  float64
     38  class                        25192 non-null  int64
     39  index_num                    25192 non-null  int64
    dtypes: float64(15), int64(25)
    memory usage: 7.7 MB
 %% Cell type:code id: tags:
 ``` 
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import MinMaxScaler, StandardScaler
 ```
 %% Cell type:code id: tags:
 ``` 
 from mlxtend.frequent_patterns import apriori,association_rules
 ```
 %% Cell type:code id: tags:
 ``` 
 # Scale every column of the data with StandardScaler, then wrap the result
 # in a DataFrame that carries the original column names.
 sc = StandardScaler()
 sc_data = sc.fit(data).transform(data)
 sc_df = pd.DataFrame(data=sc_data, columns=data.columns)
 sc_df.head(10)
 ```
 %% Output
       duration  protocol_type  ...     class  index_num
    0 -0.113551      -0.444009  ... -0.934425  -1.731982
    1 -0.113551       1.325565  ... -0.934425  -1.731845
    2 -0.113551      -0.444009  ...  1.070177  -1.731707
    3 -0.113551      -0.444009  ... -0.934425  -1.731570
    4 -0.113551      -0.444009  ... -0.934425  -1.731432
    5 -0.113551      -0.444009  ...  1.070177  -1.731295
    6 -0.113551      -0.444009  ...  1.070177  -1.731157
    7 -0.113551      -0.444009  ...  1.070177  -1.731019
    8 -0.113551      -0.444009  ...  1.070177  -1.730882
    9 -0.113551      -0.444009  ...  1.070177  -1.730744
    [10 rows x 40 columns]
 %% Cell type:code id: tags:
 ``` 
 def encode_units(x):
  """Binarize one scaled value: return 1 if ``x`` is positive, otherwise 0.

  The earlier version tested ``x <= 0`` and ``x > 0`` separately, so a value
  for which both comparisons are false (NaN) fell through and returned
  ``None``. A single comparison guarantees the result is always 0 or 1.
  """
  return 1 if x > 0 else 0
 # For classification, encode the scaled data as 0 where the value is <= 0 and
 # 1 where it is > 0. One vectorized comparison applies the same rule as
 # encode_units to every cell without a Python call per element, and avoids
 # DataFrame.applymap, which pandas 2.1+ deprecates.
 train_df = (sc_df > 0).astype(int)
 train_df.head(n=10)
 ```
 %% Output
       duration  protocol_type  service  ...  dst_host_srv_rerror_rate  class  index_num
    0         0              0        0  ...                         0      0          0
    1         0              1        1  ...                         0      0          0
    2         0              0        0  ...                         0      1          0
    3         0              0        1  ...                         0      0          0
    4         0              0        1  ...                         0      0          0
    5         0              0        0  ...                         1      1          0
    6         0              0        0  ...                         0      1          0
    7         0              0        0  ...                         0      1          0
    8         0              0        1  ...                         0      1          0
    9         0              0        0  ...                         0      1          0
    [10 rows x 40 columns]
 %% Cell type:code id: tags:
 ``` 
 # Remove the index_num column, then split the rows into train and test data
 # at a 7:3 ratio with a fixed random seed.
 data_X = train_df.drop("index_num", axis=1)
 X_train, X_test = train_test_split(data_X, random_state=42, test_size=0.3)
 print(X_train.shape, X_test.shape)
 ```
 %% Output
    (17634, 39) (7558, 39)
 %% Cell type:code id: tags:
 ``` 
 # Training frame holding every column of the data except index_num.
 df = pd.DataFrame(X_train, columns=data.columns.drop("index_num"))
 df.head()
 ```
 %% Output
           duration  protocol_type  ...  dst_host_srv_rerror_rate  class
    741           0              0  ...                         0      1
    411           0              0  ...                         1      1
    17841         0              0  ...                         1      1
    20962         0              1  ...                         0      1
    17790         0              0  ...                         0      1
    [5 rows x 39 columns]
 %% Cell type:code id: tags:
 ``` 
 # Mine frequent itemsets from the training frame with a minimum support of
 # 0.27, then list them from highest to lowest support.
 frequent_itemsets = apriori(df, use_colnames=True, min_support=0.27)
 result_desc = frequent_itemsets.sort_values(by='support', ascending=False)
 result_desc
 ```
 %% Output
          support                                           itemsets
    7    0.642225                                   (dst_host_count)
    6    0.622547                                    (same_srv_rate)
    0    0.618634                                          (service)
    1    0.611773                                             (flag)
    20   0.567143                              (flag, same_srv_rate)
    ..        ...                                                ...
    75   0.273789  (class, dst_host_srv_serror_rate, dst_host_ser...
    94   0.273733  (class, dst_host_srv_serror_rate, serror_rate,...
    95   0.273676  (class, dst_host_srv_serror_rate, srv_serror_r...
    102  0.273676  (class, srv_serror_rate, serror_rate, dst_host...
    39   0.270727             (dst_host_count, dst_host_serror_rate)
    [104 rows x 2 columns]
 %% Cell type:code id: tags:
 ``` 
 # Derive association rules using confidence as the metric with a 0.9
 # threshold, then order them by confidence and lift, both descending.
 rules = association_rules(result_desc, min_threshold=0.9, metric="confidence")
 rules = rules.sort_values(by=['confidence', 'lift'], ascending=False)
 rules
 ```
 %% Output
                                               antecedents  ...   conviction
    220            (srv_serror_rate, dst_host_serror_rate)  ...          inf
    250     (class, srv_serror_rate, dst_host_serror_rate)  ...          inf
    287  (dst_host_srv_serror_rate, srv_serror_rate, ds...  ...          inf
    339  (class, dst_host_srv_serror_rate, srv_serror_r...  ...          inf
    215            (dst_host_srv_serror_rate, serror_rate)  ...  3478.839061
    ..                                                 ...  ...          ...
    86                 (service, same_srv_rate, logged_in)  ...     5.405698
    197          (dst_host_same_srv_rate, flag, logged_in)  ...     6.667782
    75                          (service, flag, logged_in)  ...     5.075064
    152                (dst_host_same_srv_rate, logged_in)  ...     6.630236
    40                                         (logged_in)  ...     5.703116
    [367 rows x 9 columns]
 %% Cell type:code id: tags:
 ``` 
 # Keep only the rules whose consequent is the single item "class".
 rules_list = rules.loc[rules['consequents'] == {"class"}]
 rules_list
 ```
 %% Output
                                               antecedents  ...  conviction
    314  (dst_host_srv_serror_rate, serror_rate, dst_ho...  ...  515.533900
    327  (dst_host_srv_serror_rate, srv_serror_rate, ds...  ...  515.427209
    341  (serror_rate, dst_host_srv_serror_rate, srv_se...  ...  515.427209
    246            (srv_serror_rate, dst_host_serror_rate)  ...  172.627039
    251  (serror_rate, srv_serror_rate, dst_host_serror...  ...  172.627039
    236        (dst_host_srv_serror_rate, srv_serror_rate)  ...  144.033685
    264            (dst_host_srv_serror_rate, serror_rate)  ...  143.915139
    269  (serror_rate, dst_host_srv_serror_rate, srv_se...  ...  143.885502
    241                (serror_rate, dst_host_serror_rate)  ...   81.235665
    308   (dst_host_srv_serror_rate, dst_host_serror_rate)  ...   52.044171
    233                         (dst_host_srv_serror_rate)  ...   32.834346
    228                     (serror_rate, srv_serror_rate)  ...   20.914077
    210                                  (srv_serror_rate)  ...   18.526555
    207                                      (serror_rate)  ...   15.038154
    232                             (dst_host_serror_rate)  ...   14.668640
    [15 rows x 9 columns]
 %% Cell type:code id: tags:
 ``` 
 # Antecedent itemsets of the rules selected above.
 col = rules_list['antecedents']
 col
 ```
 %% Output
    314    (dst_host_srv_serror_rate, serror_rate, dst_ho...
    327    (dst_host_srv_serror_rate, srv_serror_rate, ds...
    341    (serror_rate, dst_host_srv_serror_rate, srv_se...
    246              (srv_serror_rate, dst_host_serror_rate)
    251    (serror_rate, srv_serror_rate, dst_host_serror...
    236          (dst_host_srv_serror_rate, srv_serror_rate)
    264              (dst_host_srv_serror_rate, serror_rate)
    269    (serror_rate, dst_host_srv_serror_rate, srv_se...
    241                  (serror_rate, dst_host_serror_rate)
    308     (dst_host_srv_serror_rate, dst_host_serror_rate)
    233                           (dst_host_srv_serror_rate)
    228                       (serror_rate, srv_serror_rate)
    210                                    (srv_serror_rate)
    207                                        (serror_rate)
    232                               (dst_host_serror_rate)
    Name: antecedents, dtype: object
 %% Cell type:code id: tags:
 ``` 
 # Save the antecedent itemsets to col_list.csv in the working directory.
 col.to_csv('./col_list.csv')
 ```
 %% Cell type:code id: tags:
 ``` 
 # Test frame holding every column of the data except index_num.
 test = pd.DataFrame(X_test, columns=data.columns.drop("index_num"))
 test.head(10)
 ```
 %% Output
           duration  protocol_type  ...  dst_host_srv_rerror_rate  class
    19064         0              1  ...                         0      1
    11127         0              0  ...                         1      0
    6517          0              0  ...                         0      1
    2973          0              1  ...                         0      0
    13339         0              0  ...                         0      1
    19289         0              0  ...                         0      0
    2166          0              0  ...                         0      0
    5548          0              0  ...                         0      0
    10887         0              0  ...                         0      0
    2222          0              0  ...                         0      1
    [10 rows x 39 columns]
 %% Cell type:code id: tags:
 ``` 
 # Rebinds col (previously the antecedents Series) to the four error-rate
 # feature names; the filtering cells below spell these names out directly.
 col = ['dst_host_srv_serror_rate', 'srv_serror_rate', 'serror_rate', 'dst_host_serror_rate']
 ```
 %% Cell type:code id: tags:
 ``` 
 # Drop every test row in which at least one of the four error-rate features
 # is 0; test_df keeps the rows where none of them is 0.
 idx_b = test[
     test[['dst_host_srv_serror_rate', 'srv_serror_rate',
           'serror_rate', 'dst_host_serror_rate']].eq(0).any(axis=1)
 ].index
 test_df = test.drop(index=idx_b)
 # Of those rows, drop the ones with class == 0; test_err keeps the rest.
 idx_class = test_df[test_df['class'].eq(0)].index
 test_err = test_df.drop(index=idx_class)
 print(test_df.shape, test_err.shape, sep="\n")
 ```
 %% Output
    (2121, 39)
    (2120, 39)
 %% Cell type:code id: tags:
 ``` 
 # Drop every test row in which all four error-rate features are 1; test_df
 # keeps the remaining rows.
 idx_a = test[
     test[['dst_host_srv_serror_rate', 'srv_serror_rate',
           'serror_rate', 'dst_host_serror_rate']].eq(1).all(axis=1)
 ].index
 test_df = test.drop(index=idx_a)
 # Of those rows, drop the ones with class == 1; test_err keeps the rest.
 idx_class = test_df[test_df['class'].eq(1)].index
 test_err = test_df.drop(index=idx_class)
 print(test_df.shape, test_err.shape, sep="\n")
 ```
 %% Output
    (5437, 39)
    (4041, 39)
 %% Cell type:markdown id: tags:
               |   Prediction of Attack     |  Prediction of Non-Attack
 ---
    Attack     |    True Positive : 2120    |    False Negative : 1396
 ---
    Non-Attack |    False Positive : 1      |    True Negative : 4041
 %% Cell type:markdown id: tags:
 **Apriori Test**
 *   Accuracy = 81.52%
 *   Precision = 99.95%
 *   Recall = 60.30%
 *   Fallout = 0.02%
 *   F-score = 75.22%