Skip to content
Snippets Groups Projects
Unverified Commit 3cb1e084 authored by 지수's avatar 지수 Committed by GitHub
Browse files

Delete [DM]APriori.ipynb

parent 9415b528
Branches
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
<a href="https://colab.research.google.com/github/lani009/IDS-DataMining/blob/main/%5BDM%5DApriori.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
%% Cell type:code id: tags:
```
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
```
%% Cell type:code id: tags:
```
# Load the dataset from DM_data.csv (path is relative to the working
# directory) and print its column / dtype / non-null summary.
data = pd.read_csv('DM_data.csv')
data.info()
```
%% Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25192 entries, 0 to 25191
Data columns (total 40 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 duration 25192 non-null int64
1 protocol_type 25192 non-null int64
2 service 25192 non-null int64
3 flag 25192 non-null int64
4 src_bytes 25192 non-null int64
5 dst_bytes 25192 non-null int64
6 land 25192 non-null int64
7 wrong_fragment 25192 non-null int64
8 hot 25192 non-null int64
9 num_failed_logins 25192 non-null int64
10 logged_in 25192 non-null int64
11 num_compromised 25192 non-null int64
12 root_shell 25192 non-null int64
13 su_attempted 25192 non-null int64
14 num_root 25192 non-null int64
15 num_file_creations 25192 non-null int64
16 num_shells 25192 non-null int64
17 num_access_files 25192 non-null int64
18 is_guest_login 25192 non-null int64
19 count 25192 non-null int64
20 srv_count 25192 non-null int64
21 serror_rate 25192 non-null float64
22 srv_serror_rate 25192 non-null float64
23 rerror_rate 25192 non-null float64
24 srv_rerror_rate 25192 non-null float64
25 same_srv_rate 25192 non-null float64
26 diff_srv_rate 25192 non-null float64
27 srv_diff_host_rate 25192 non-null float64
28 dst_host_count 25192 non-null int64
29 dst_host_srv_count 25192 non-null int64
30 dst_host_same_srv_rate 25192 non-null float64
31 dst_host_diff_srv_rate 25192 non-null float64
32 dst_host_same_src_port_rate 25192 non-null float64
33 dst_host_srv_diff_host_rate 25192 non-null float64
34 dst_host_serror_rate 25192 non-null float64
35 dst_host_srv_serror_rate 25192 non-null float64
36 dst_host_rerror_rate 25192 non-null float64
37 dst_host_srv_rerror_rate 25192 non-null float64
38 class 25192 non-null int64
39 index_num 25192 non-null int64
dtypes: float64(15), int64(25)
memory usage: 7.7 MB
%% Cell type:code id: tags:
```
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
```
%% Cell type:code id: tags:
```
from mlxtend.frequent_patterns import apriori,association_rules
```
%% Cell type:code id: tags:
```
# Standardize every column of `data` with StandardScaler and wrap the result
# back into a DataFrame that keeps the original column names.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed further down, and the `class` and `index_num` columns are
# scaled together with the features -- confirm this is intended.
sc = StandardScaler().fit(data)
sc_data = sc.transform(data)
sc_df = pd.DataFrame(data=sc_data, columns=data.columns)
sc_df.head(10)
```
%% Output
duration protocol_type ... class index_num
0 -0.113551 -0.444009 ... -0.934425 -1.731982
1 -0.113551 1.325565 ... -0.934425 -1.731845
2 -0.113551 -0.444009 ... 1.070177 -1.731707
3 -0.113551 -0.444009 ... -0.934425 -1.731570
4 -0.113551 -0.444009 ... -0.934425 -1.731432
5 -0.113551 -0.444009 ... 1.070177 -1.731295
6 -0.113551 -0.444009 ... 1.070177 -1.731157
7 -0.113551 -0.444009 ... 1.070177 -1.731019
8 -0.113551 -0.444009 ... 1.070177 -1.730882
9 -0.113551 -0.444009 ... 1.070177 -1.730744
[10 rows x 40 columns]
%% Cell type:code id: tags:
```
def encode_units(x):
    """Binarize one scaled value for use as a 0/1 item.

    Args:
        x: A single numeric value (e.g. one cell of the standardized frame).

    Returns:
        1 if ``x`` is strictly greater than 0, otherwise 0. Values that are
        neither positive nor non-positive (NaN) also map to 0 instead of
        silently falling through and returning ``None``.
    """
    return 1 if x > 0 else 0
# Encode the scaled data for the rule mining / classification below:
# values that are zero or negative become 0, positive values become 1.
# The comparison is applied to the whole frame at once rather than calling
# encode_units cell by cell through DataFrame.applymap, which is deprecated
# since pandas 2.1 and much slower; the resulting 0/1 integers are the same.
train_df = (sc_df > 0).astype(int)
train_df.head(n=10)
```
%% Output
duration protocol_type service ... dst_host_srv_rerror_rate class index_num
0 0 0 0 ... 0 0 0
1 0 1 1 ... 0 0 0
2 0 0 0 ... 0 1 0
3 0 0 1 ... 0 0 0
4 0 0 1 ... 0 0 0
5 0 0 0 ... 1 1 0
6 0 0 0 ... 0 1 0
7 0 0 0 ... 0 1 0
8 0 0 1 ... 0 1 0
9 0 0 0 ... 0 1 0
[10 rows x 40 columns]
%% Cell type:code id: tags:
```
# Remove the `index_num` column, then split the rows into train and test
# sets at a 7:3 ratio (fixed random_state so the split is reproducible).
data_X = train_df.drop("index_num", axis=1)
X_train, X_test = train_test_split(data_X, random_state=42, test_size=0.3)
print(X_train.shape, X_test.shape)
```
%% Output
(17634, 39) (7558, 39)
%% Cell type:code id: tags:
```
# Training frame used for mining: X_train with every column of `data`
# except `index_num`.
df = pd.DataFrame(X_train, columns=data.columns.drop("index_num"))
df.head()
```
%% Output
duration protocol_type ... dst_host_srv_rerror_rate class
741 0 0 ... 0 1
411 0 0 ... 1 1
17841 0 0 ... 1 1
20962 0 1 ... 0 1
17790 0 0 ... 0 1
[5 rows x 39 columns]
%% Cell type:code id: tags:
```
# Mine the itemsets whose support in the training frame is at least 0.27,
# labelled by column name, and list them from highest to lowest support.
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.27)
result_desc = frequent_itemsets.sort_values(by='support', ascending=False)
result_desc
```
%% Output
support itemsets
7 0.642225 (dst_host_count)
6 0.622547 (same_srv_rate)
0 0.618634 (service)
1 0.611773 (flag)
20 0.567143 (flag, same_srv_rate)
.. ... ...
75 0.273789 (class, dst_host_srv_serror_rate, dst_host_ser...
94 0.273733 (class, dst_host_srv_serror_rate, serror_rate,...
95 0.273676 (class, dst_host_srv_serror_rate, srv_serror_r...
102 0.273676 (class, srv_serror_rate, serror_rate, dst_host...
39 0.270727 (dst_host_count, dst_host_serror_rate)
[104 rows x 2 columns]
%% Cell type:code id: tags:
```
# Derive association rules with confidence >= 0.9 from the frequent
# itemsets, ordered by confidence and then lift, both descending.
rules = association_rules(result_desc, min_threshold=0.9, metric="confidence")
rules = rules.sort_values(by=['confidence', 'lift'], ascending=False)
rules
```
%% Output
antecedents ... conviction
220 (srv_serror_rate, dst_host_serror_rate) ... inf
250 (class, srv_serror_rate, dst_host_serror_rate) ... inf
287 (dst_host_srv_serror_rate, srv_serror_rate, ds... ... inf
339 (class, dst_host_srv_serror_rate, srv_serror_r... ... inf
215 (dst_host_srv_serror_rate, serror_rate) ... 3478.839061
.. ... ... ...
86 (service, same_srv_rate, logged_in) ... 5.405698
197 (dst_host_same_srv_rate, flag, logged_in) ... 6.667782
75 (service, flag, logged_in) ... 5.075064
152 (dst_host_same_srv_rate, logged_in) ... 6.630236
40 (logged_in) ... 5.703116
[367 rows x 9 columns]
%% Cell type:code id: tags:
```
# Keep only the rules whose consequent is exactly the single item "class".
rules_list = rules.loc[rules['consequents'] == {"class"}]
rules_list
```
%% Output
antecedents ... conviction
314 (dst_host_srv_serror_rate, serror_rate, dst_ho... ... 515.533900
327 (dst_host_srv_serror_rate, srv_serror_rate, ds... ... 515.427209
341 (serror_rate, dst_host_srv_serror_rate, srv_se... ... 515.427209
246 (srv_serror_rate, dst_host_serror_rate) ... 172.627039
251 (serror_rate, srv_serror_rate, dst_host_serror... ... 172.627039
236 (dst_host_srv_serror_rate, srv_serror_rate) ... 144.033685
264 (dst_host_srv_serror_rate, serror_rate) ... 143.915139
269 (serror_rate, dst_host_srv_serror_rate, srv_se... ... 143.885502
241 (serror_rate, dst_host_serror_rate) ... 81.235665
308 (dst_host_srv_serror_rate, dst_host_serror_rate) ... 52.044171
233 (dst_host_srv_serror_rate) ... 32.834346
228 (serror_rate, srv_serror_rate) ... 20.914077
210 (srv_serror_rate) ... 18.526555
207 (serror_rate) ... 15.038154
232 (dst_host_serror_rate) ... 14.668640
[15 rows x 9 columns]
%% Cell type:code id: tags:
```
# Antecedent itemsets of the rules selected above (those that conclude "class").
col = rules_list['antecedents']
col
```
%% Output
314 (dst_host_srv_serror_rate, serror_rate, dst_ho...
327 (dst_host_srv_serror_rate, srv_serror_rate, ds...
341 (serror_rate, dst_host_srv_serror_rate, srv_se...
246 (srv_serror_rate, dst_host_serror_rate)
251 (serror_rate, srv_serror_rate, dst_host_serror...
236 (dst_host_srv_serror_rate, srv_serror_rate)
264 (dst_host_srv_serror_rate, serror_rate)
269 (serror_rate, dst_host_srv_serror_rate, srv_se...
241 (serror_rate, dst_host_serror_rate)
308 (dst_host_srv_serror_rate, dst_host_serror_rate)
233 (dst_host_srv_serror_rate)
228 (serror_rate, srv_serror_rate)
210 (srv_serror_rate)
207 (serror_rate)
232 (dst_host_serror_rate)
Name: antecedents, dtype: object
%% Cell type:code id: tags:
```
# Write the antecedent itemsets to col_list.csv in the working directory.
col.to_csv('./col_list.csv')
```
%% Cell type:code id: tags:
```
# Evaluation frame: X_test with every column of `data` except `index_num`.
test = pd.DataFrame(X_test, columns=data.columns.drop("index_num"))
test.head(10)
```
%% Output
duration protocol_type ... dst_host_srv_rerror_rate class
19064 0 1 ... 0 1
11127 0 0 ... 1 0
6517 0 0 ... 0 1
2973 0 1 ... 0 0
13339 0 0 ... 0 1
19289 0 0 ... 0 0
2166 0 0 ... 0 0
5548 0 0 ... 0 0
10887 0 0 ... 0 0
2222 0 0 ... 0 1
[10 rows x 39 columns]
%% Cell type:code id: tags:
```
# The four serror-rate columns that appear in the antecedents listed above.
# NOTE(review): this rebinds `col`, which previously held the antecedents Series.
col = ['dst_host_srv_serror_rate', 'srv_serror_rate', 'serror_rate', 'dst_host_serror_rate']
```
%% Cell type:code id: tags:
```
# Case 1: the rule fires. Drop every test row in which at least one of the
# indicator columns listed in `col` is 0, so test_df keeps only the rows where
# all of them are set.
idx_b = test[(test[col] == 0).any(axis=1)].index
test_df = test.drop(idx_b)
# Among those rows, drop the ones labelled class 0; test_err is left holding
# the rows that have every indicator set and class 1.
idx_class = test_df[test_df['class'] == 0].index
test_err = test_df.drop(idx_class)
# First shape: rows where the rule fires. Second shape: those with class 1.
print(test_df.shape)
print(test_err.shape)
```
%% Output
(2121, 39)
(2120, 39)
%% Cell type:code id: tags:
```
# Case 2: the rule does not fire. Drop every test row in which all of the
# indicator columns listed in `col` are 1, so test_df keeps the remaining rows.
idx_a = test[(test[col] == 1).all(axis=1)].index
test_df = test.drop(idx_a)
# Among those rows, drop the ones labelled class 1; test_err is left holding
# the rows where the rule does not fire and the class is 0.
idx_class = test_df[test_df['class'] == 1].index
test_err = test_df.drop(idx_class)
# First shape: rows where the rule does not fire. Second shape: those with class 0.
print(test_df.shape)
print(test_err.shape)
```
%% Output
(5437, 39)
(4041, 39)
%% Cell type:markdown id: tags:
| | Prediction of Attack | Prediction of Non-Attack |
|---|---|---|
| Attack | True Positive : 2120 | False Negative : 1396 |
| Non-Attack | False Positive : 1 | True Negative : 4041 |
%% Cell type:markdown id: tags:
**Apriori Test**
* Accuracy = 81.52%
* Precision = 99.95%
* Recall = 60.30%
* Fallout = 0.02%
* F-score = 75.2%
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment