Skip to content
Snippets Groups Projects
Commit 0e10b95c authored by 지수's avatar 지수
Browse files

Accuracy update

parent 44fbba61
Branches
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
<a href="https://colab.research.google.com/github/lani009/IDS-DataMining/blob/main/%5BDM%5DApriori.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
%% Cell type:code id: tags:
```
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
```
%% Cell type:code id: tags:
```
# Load the dataset from the CSV file and print its column / dtype summary.
data = pd.read_csv(filepath_or_buffer='DM_data.csv')
data.info()
```
%% Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25192 entries, 0 to 25191
Data columns (total 40 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 duration 25192 non-null int64
1 protocol_type 25192 non-null int64
2 service 25192 non-null int64
3 flag 25192 non-null int64
4 src_bytes 25192 non-null int64
5 dst_bytes 25192 non-null int64
6 land 25192 non-null int64
7 wrong_fragment 25192 non-null int64
8 hot 25192 non-null int64
9 num_failed_logins 25192 non-null int64
10 logged_in 25192 non-null int64
11 num_compromised 25192 non-null int64
12 root_shell 25192 non-null int64
13 su_attempted 25192 non-null int64
14 num_root 25192 non-null int64
15 num_file_creations 25192 non-null int64
16 num_shells 25192 non-null int64
17 num_access_files 25192 non-null int64
18 is_guest_login 25192 non-null int64
19 count 25192 non-null int64
20 srv_count 25192 non-null int64
21 serror_rate 25192 non-null float64
22 srv_serror_rate 25192 non-null float64
23 rerror_rate 25192 non-null float64
24 srv_rerror_rate 25192 non-null float64
25 same_srv_rate 25192 non-null float64
26 diff_srv_rate 25192 non-null float64
27 srv_diff_host_rate 25192 non-null float64
28 dst_host_count 25192 non-null int64
29 dst_host_srv_count 25192 non-null int64
30 dst_host_same_srv_rate 25192 non-null float64
31 dst_host_diff_srv_rate 25192 non-null float64
32 dst_host_same_src_port_rate 25192 non-null float64
33 dst_host_srv_diff_host_rate 25192 non-null float64
34 dst_host_serror_rate 25192 non-null float64
35 dst_host_srv_serror_rate 25192 non-null float64
36 dst_host_rerror_rate 25192 non-null float64
37 dst_host_srv_rerror_rate 25192 non-null float64
38 class 25192 non-null int64
39 index_num 25192 non-null int64
dtypes: float64(15), int64(25)
memory usage: 7.7 MB
%% Cell type:code id: tags:
```
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
```
%% Cell type:code id: tags:
```
from mlxtend.frequent_patterns import apriori,association_rules
```
%% Cell type:code id: tags:
```
# Scale the data with StandardScaler and wrap the result back into a
# DataFrame that keeps the original column names.
# NOTE(review): the scaler is fitted on the whole table (every column of
# `data`, before any train/test split) -- confirm this is intended.
sc = StandardScaler()
sc_data = sc.fit(data).transform(data)
sc_df = pd.DataFrame(data=sc_data, columns=data.columns)
sc_df.head(10)
```
%% Output
duration protocol_type ... class index_num
0 -0.113551 -0.444009 ... -0.934425 -1.731982
1 -0.113551 1.325565 ... -0.934425 -1.731845
2 -0.113551 -0.444009 ... 1.070177 -1.731707
3 -0.113551 -0.444009 ... -0.934425 -1.731570
4 -0.113551 -0.444009 ... -0.934425 -1.731432
5 -0.113551 -0.444009 ... 1.070177 -1.731295
6 -0.113551 -0.444009 ... 1.070177 -1.731157
7 -0.113551 -0.444009 ... 1.070177 -1.731019
8 -0.113551 -0.444009 ... 1.070177 -1.730882
9 -0.113551 -0.444009 ... 1.070177 -1.730744
[10 rows x 40 columns]
%% Cell type:code id: tags:
```
def encode_units(x):
    """Binarize a scaled value: return 1 if it is strictly positive, else 0.

    Zero and negative values map to 0. Any value that is not greater than
    zero -- including NaN, which fails every comparison -- also maps to 0,
    so the function always returns an int and never falls through to an
    implicit ``None``.
    """
    return 1 if x > 0 else 0
# Encode the scaled data for rule mining: values <= 0 become 0 and values
# > 0 become 1. A vectorized comparison yields the same 0/1 frame as applying
# encode_units cell by cell, without the per-element Python call and without
# relying on the deprecated DataFrame.applymap.
train_df = (sc_df > 0).astype(int)
train_df.head(n=10)
```
%% Output
duration protocol_type service ... dst_host_srv_rerror_rate class index_num
0 0 0 0 ... 0 0 0
1 0 1 1 ... 0 0 0
2 0 0 0 ... 0 1 0
3 0 0 1 ... 0 0 0
4 0 0 1 ... 0 0 0
5 0 0 0 ... 1 1 0
6 0 0 0 ... 0 1 0
7 0 0 0 ... 0 1 0
8 0 0 1 ... 0 1 0
9 0 0 0 ... 0 1 0
[10 rows x 40 columns]
%% Cell type:code id: tags:
```
# Split into train and test data at a 7:3 ratio, leaving out the index_num
# column (NOTE(review): index_num looks like a row counter -- confirm).
# random_state is fixed so the split, and therefore the mined rules and the
# reported metrics, can be reproduced from run to run. The outputs recorded
# in this notebook came from an unseeded run and may differ slightly.
data_X = train_df.drop(columns=["index_num"])
X_train, X_test = train_test_split(
    data_X, test_size=0.3, shuffle=True, random_state=42
)
print(X_train.shape, X_test.shape)
```
%% Output
(17634, 39) (7558, 39)
%% Cell type:code id: tags:
```
# Training frame used for rule mining: X_train with every column of `data`
# except index_num.
feature_columns = data.columns.drop(["index_num"])
df = pd.DataFrame(X_train, columns=feature_columns)
df.head()
```
%% Output
duration protocol_type ... dst_host_srv_rerror_rate class
21843 0 1 ... 0 1
10713 0 0 ... 0 1
1773 0 0 ... 0 0
6797 0 1 ... 0 0
23799 0 0 ... 0 1
[5 rows x 39 columns]
%% Cell type:code id: tags:
```
# Mine itemsets of at most two items with support >= 0.1 from the training
# frame, then list them from highest to lowest support.
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True, max_len=2)
result_desc = frequent_itemsets.sort_values(by='support', ascending=False)
result_desc
```
%% Output
support itemsets
13 0.644267 (dst_host_count)
10 0.623001 (same_srv_rate)
1 0.617444 (service)
2 0.609221 (flag)
45 0.568334 (flag, same_srv_rate)
.. ... ...
79 0.101962 (class, rerror_rate)
82 0.101565 (class, srv_rerror_rate)
27 0.101225 (protocol_type, dst_host_srv_count)
29 0.100488 (protocol_type, dst_host_same_src_port_rate)
103 0.100374 (class, dst_host_same_src_port_rate)
[109 rows x 2 columns]
%% Cell type:code id: tags:
```
# Derive association rules with confidence >= 0.8 from the frequent itemsets
# and order them by confidence, then lift, both descending.
rules = association_rules(
    result_desc, metric="confidence", min_threshold=0.8
).sort_values(by=['confidence', 'lift'], ascending=False)
rules
```
%% Output
antecedents consequents ... leverage conviction
40 (protocol_type) (flag) ... 0.072199 inf
50 (srv_count) (flag) ... 0.050880 100.690768
17 (srv_serror_rate) (serror_rate) ... 0.201427 170.141448
20 (dst_host_srv_serror_rate) (dst_host_serror_rate) ... 0.196945 74.892007
51 (srv_count) (same_srv_rate) ... 0.048274 38.011332
.. ... ... ... ... ...
37 (dst_host_serror_rate) (count) ... 0.127893 3.391583
68 (rerror_rate) (class) ... 0.043448 2.846193
36 (serror_rate) (count) ... 0.127778 3.349567
69 (srv_rerror_rate) (class) ... 0.042602 2.711262
13 (dst_host_srv_count) (service) ... 0.080245 1.929109
[70 rows x 9 columns]
%% Cell type:code id: tags:
```
# Keep only the rules whose consequent is exactly the single item "class".
has_class_consequent = rules['consequents'] == {"class"}
rules_list = rules[has_class_consequent]
rules_list
```
%% Output
antecedents consequents ... leverage conviction
31 (dst_host_srv_serror_rate) (class) ... 0.144706 33.714680
30 (srv_serror_rate) (class) ... 0.142795 17.141318
29 (dst_host_serror_rate) (class) ... 0.142164 14.850403
22 (serror_rate) (class) ... 0.142093 13.915800
15 (count) (class) ... 0.153262 4.692114
48 (diff_srv_rate) (class) ... 0.066509 3.727503
68 (rerror_rate) (class) ... 0.043448 2.846193
69 (srv_rerror_rate) (class) ... 0.042602 2.711262
[8 rows x 9 columns]
%% Cell type:code id: tags:
```
# Test frame used for evaluation: X_test with every column of `data` except
# index_num.
feature_columns = data.columns.drop(["index_num"])
test = pd.DataFrame(X_test, columns=feature_columns)
test.head(10)
```
%% Output
duration protocol_type ... dst_host_srv_rerror_rate class
3445 0 0 ... 0 0
17461 0 0 ... 0 0
14662 0 0 ... 0 1
4043 0 0 ... 1 1
9161 0 0 ... 0 0
3396 0 0 ... 1 1
16768 0 0 ... 0 0
19271 0 0 ... 0 0
11404 0 0 ... 0 0
10421 0 0 ... 0 0
[10 rows x 39 columns]
%% Cell type:code id: tags:
```
# Columns that appear as antecedents of the rules with consequent "class"
# (see the rules_list output of the earlier cell).
col = [
    'dst_host_srv_serror_rate',
    'srv_serror_rate',
    'serror_rate',
    'dst_host_serror_rate',
    'count',
    'diff_srv_rate',
    'rerror_rate',
    'srv_rerror_rate',
]
```
%% Cell type:code id: tags:
```
# Rows predicted as attack: every test row except those where all rule
# antecedent columns (the `col` list defined in the previous cell) are 0.
# Using `col` keeps this cell in sync with that list instead of repeating
# the eight column names by hand.
all_antecedents_zero = (test[col] == 0).all(axis=1)
test_df = test[~all_antecedents_zero]  # predicted as attack
# Of those, the rows whose actual class is not 0, i.e. real attacks
# (the true positives in the confusion matrix).
test_err = test_df[test_df['class'] != 0]  # actual attacks
print(test_df.shape)
print(test_err.shape)
```
%% Output
(3927, 39)
(3072, 39)
%% Cell type:code id: tags:
```
# Rows predicted as non-attack: every test row except those where at least
# one rule antecedent column (the `col` list defined in an earlier cell) is 1.
# Using `col` keeps this cell in sync with that list instead of repeating
# the eight column names by hand.
any_antecedent_one = (test[col] == 1).any(axis=1)
test_df = test[~any_antecedent_one]  # predicted as non-attack
# Of those, the rows whose actual class is not 1, i.e. real non-attacks
# (the true negatives in the confusion matrix).
test_err = test_df[test_df['class'] != 1]  # actual non-attacks
print(test_df.shape)
print(test_err.shape)
```
%% Output
(3631, 39)
(3182, 39)
%% Cell type:markdown id: tags:
min_support = 0.1 / max_len = 2 / min_threshold = 0.8
('dst_host_srv_serror_rate', 'srv_serror_rate', 'serror_rate', 'dst_host_serror_rate', 'count', 'diff_srv_rate', 'rerror_rate', 'srv_rerror_rate')
| | Prediction of Attack | Prediction of Non-Attack |
| --- | --- | --- |
| Attack | True Positive : 3072 | False Negative : 449 |
| Non-Attack | False Positive : 855 | True Negative : 3182 |
**Apriori Test**
* Accuracy: 82.75%
* Precision: 78.23%
* Recall: 87.25%
* F1 score: 82.49%
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment