Skip to content
Snippets Groups Projects
Unverified Commit aa60bd86 authored by 지수's avatar 지수 Committed by GitHub
Browse files

Delete [DM]Apriori.ipynb

parent 50296fc7
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
<a href="https://colab.research.google.com/github/lani009/IDS-DataMining/blob/main/%5BDM%5DApriori.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
%% Cell type:code id: tags:
```
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
```
%% Cell type:code id: tags:
```
# Load the dataset from DM_data.csv (path is relative to the working directory).
data = pd.read_csv('DM_data.csv')
# Print column names, non-null counts and dtypes as a quick sanity check.
data.info()
```
%% Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25192 entries, 0 to 25191
Data columns (total 40 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 duration 25192 non-null int64
1 protocol_type 25192 non-null int64
2 service 25192 non-null int64
3 flag 25192 non-null int64
4 src_bytes 25192 non-null int64
5 dst_bytes 25192 non-null int64
6 land 25192 non-null int64
7 wrong_fragment 25192 non-null int64
8 hot 25192 non-null int64
9 num_failed_logins 25192 non-null int64
10 logged_in 25192 non-null int64
11 num_compromised 25192 non-null int64
12 root_shell 25192 non-null int64
13 su_attempted 25192 non-null int64
14 num_root 25192 non-null int64
15 num_file_creations 25192 non-null int64
16 num_shells 25192 non-null int64
17 num_access_files 25192 non-null int64
18 is_guest_login 25192 non-null int64
19 count 25192 non-null int64
20 srv_count 25192 non-null int64
21 serror_rate 25192 non-null float64
22 srv_serror_rate 25192 non-null float64
23 rerror_rate 25192 non-null float64
24 srv_rerror_rate 25192 non-null float64
25 same_srv_rate 25192 non-null float64
26 diff_srv_rate 25192 non-null float64
27 srv_diff_host_rate 25192 non-null float64
28 dst_host_count 25192 non-null int64
29 dst_host_srv_count 25192 non-null int64
30 dst_host_same_srv_rate 25192 non-null float64
31 dst_host_diff_srv_rate 25192 non-null float64
32 dst_host_same_src_port_rate 25192 non-null float64
33 dst_host_srv_diff_host_rate 25192 non-null float64
34 dst_host_serror_rate 25192 non-null float64
35 dst_host_srv_serror_rate 25192 non-null float64
36 dst_host_rerror_rate 25192 non-null float64
37 dst_host_srv_rerror_rate 25192 non-null float64
38 class 25192 non-null int64
39 index_num 25192 non-null int64
dtypes: float64(15), int64(25)
memory usage: 7.7 MB
%% Cell type:code id: tags:
```
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
```
%% Cell type:code id: tags:
```
from mlxtend.frequent_patterns import apriori,association_rules
```
%% Cell type:code id: tags:
```
# Standardize every column of `data` with a StandardScaler fitted on the
# whole frame.
# NOTE(review): the scaler is fitted on all rows and all columns -- including
# `class` and `index_num` -- before the train/test split further down, so the
# held-out rows influence the statistics used for the later binarization.
# Confirm this is intended.
sc = StandardScaler()
sc_data = sc.fit_transform(data)
# Wrap the scaled array back into a DataFrame carrying the original column names.
sc_df = pd.DataFrame(sc_data, columns=data.columns)
sc_df.head(n=10)
```
%% Output
duration protocol_type ... class index_num
0 -0.113551 -0.444009 ... -0.934425 -1.731982
1 -0.113551 1.325565 ... -0.934425 -1.731845
2 -0.113551 -0.444009 ... 1.070177 -1.731707
3 -0.113551 -0.444009 ... -0.934425 -1.731570
4 -0.113551 -0.444009 ... -0.934425 -1.731432
5 -0.113551 -0.444009 ... 1.070177 -1.731295
6 -0.113551 -0.444009 ... 1.070177 -1.731157
7 -0.113551 -0.444009 ... 1.070177 -1.731019
8 -0.113551 -0.444009 ... 1.070177 -1.730882
9 -0.113551 -0.444009 ... 1.070177 -1.730744
[10 rows x 40 columns]
%% Cell type:code id: tags:
```
def encode_units(x: float) -> int:
    """Binarize one numeric cell value into a 0/1 item flag.

    Args:
        x: A single numeric value.

    Returns:
        1 when ``x`` is strictly positive, otherwise 0.
    """
    # A single strict comparison covers every input: zero and negatives map
    # to 0, and NaN (for which ``x > 0`` is False) also maps to 0 instead of
    # falling through two overlapping tests and implicitly returning None.
    return 1 if x > 0 else 0
# Binarize the standardized frame: strictly positive values become 1 and
# everything else 0 -- the same mapping encode_units applies cell by cell.
# Done as one vectorized comparison rather than DataFrame.applymap, which is
# deprecated since pandas 2.1 and makes one Python call per cell.
train_df = (sc_df > 0).astype("int64")
train_df.head(n=10)
```
%% Output
duration protocol_type service ... dst_host_srv_rerror_rate class index_num
0 0 0 0 ... 0 0 0
1 0 1 1 ... 0 0 0
2 0 0 0 ... 0 1 0
3 0 0 1 ... 0 0 0
4 0 0 1 ... 0 0 0
5 0 0 0 ... 1 1 0
6 0 0 0 ... 0 1 0
7 0 0 0 ... 0 1 0
8 0 0 1 ... 0 1 0
9 0 0 0 ... 0 1 0
[10 rows x 40 columns]
%% Cell type:code id: tags:
```
# `index_num` is left out of the item matrix before splitting.
data_X = train_df.drop("index_num", axis=1)
# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test = train_test_split(
    data_X,
    test_size=0.33,
    random_state=42,
)
print(f"{X_train.shape} {X_test.shape}")
```
%% Output
(16878, 39) (8314, 39)
%% Cell type:code id: tags:
```
# Training item matrix: the X_train rows, with every original column except
# `index_num`.
df = pd.DataFrame(X_train, columns=data.columns.drop("index_num"))
df.head(5)
```
%% Output
duration protocol_type ... dst_host_srv_rerror_rate class
14666 0 0 ... 0 1
10743 0 0 ... 0 0
2487 0 0 ... 0 1
21251 0 0 ... 0 1
7387 0 1 ... 0 0
[5 rows x 39 columns]
%% Cell type:code id: tags:
```
# Mine itemsets of at most two items that occur in at least 1% of the
# training rows; itemsets are reported by column name.
frequent_itemsets = apriori(
    df,
    min_support=0.01,
    use_colnames=True,
    max_len=2,
)
# Most frequent itemsets first.
result_desc = frequent_itemsets.sort_values(by="support", ascending=False)
result_desc
```
%% Output
support itemsets
18 0.642612 (dst_host_count)
15 0.621519 (same_srv_rate)
2 0.619327 (service)
3 0.610973 (flag)
90 0.566833 (flag, same_srv_rate)
.. ... ...
112 0.010724 (dst_bytes, dst_host_same_src_port_rate)
129 0.010606 (num_compromised, same_srv_rate)
38 0.010487 (dst_host_rerror_rate, duration)
85 0.010191 (num_compromised, flag)
40 0.010013 (class, duration)
[259 rows x 2 columns]
%% Cell type:code id: tags:
```
# Keep rules with confidence >= 0.90, ordered by confidence and then lift,
# both descending.
rules = association_rules(
    result_desc,
    metric="confidence",
    min_threshold=0.90,
).sort_values(by=["confidence", "lift"], ascending=False)
rules
```
%% Output
antecedents consequents ... leverage conviction
61 (num_compromised) (logged_in) ... 0.006515 inf
31 (protocol_type) (flag) ... 0.072559 inf
60 (num_compromised) (service) ... 0.004082 inf
37 (srv_count) (flag) ... 0.049968 94.706495
11 (srv_serror_rate) (serror_rate) ... 0.201858 163.236089
.. ... ... ... ... ...
9 (logged_in) (service) ... 0.116340 4.518958
53 (dst_bytes) (dst_host_same_srv_rate) ... 0.040676 5.903750
59 (src_bytes) (flag) ... 0.003853 4.422624
1 (same_srv_rate) (flag) ... 0.187101 4.421338
47 (srv_count) (dst_host_same_srv_rate) ... 0.052525 5.162086
[64 rows x 9 columns]
%% Cell type:code id: tags:
```
# Rules whose consequent is exactly the single item `class`.
rules.loc[rules["consequents"] == {"class"}]
```
%% Output
antecedents consequents ... leverage conviction
26 (dst_host_srv_serror_rate) (class) ... 0.144959 31.582765
20 (srv_serror_rate) (class) ... 0.143791 18.714676
17 (serror_rate) (class) ... 0.143106 15.042730
25 (dst_host_serror_rate) (class) ... 0.142372 14.499759
[4 rows x 9 columns]
%% Cell type:code id: tags:
```
# Held-out item matrix: the X_test rows, with every original column except
# `index_num`.
test = pd.DataFrame(X_test, columns=data.columns.drop("index_num"))
test.head(10)
```
%% Output
duration protocol_type ... dst_host_srv_rerror_rate class
19064 0 1 ... 0 1
11127 0 0 ... 1 0
6517 0 0 ... 0 1
2973 0 1 ... 0 0
13339 0 0 ... 0 1
19289 0 0 ... 0 0
2166 0 0 ... 0 0
5548 0 0 ... 0 0
10887 0 0 ... 0 0
2222 0 0 ... 0 1
[10 rows x 39 columns]
%% Cell type:code id: tags:
```
# Antecedent columns of the four mined rules whose consequent is `class`.
# NOTE(review): `col` is not referenced by any of the later cells shown here;
# they spell the column names out by hand.
col = ['dst_host_srv_serror_rate', 'srv_serror_rate', 'serror_rate', 'dst_host_serror_rate']
```
%% Cell type:code id: tags:
```
# Rule check on the held-out rows: dst_host_srv_serror_rate -> class.
# Labels of the rows where the antecedent item is 0.
idx_1 = test.index[test['dst_host_srv_serror_rate'] == 0]
# Rows on which the rule fires.
test_df = test.drop(index=idx_1)
# Of those, labels of the rows whose class item is 0.
idx_class = test_df.index[test_df['class'] == 0]
# Rows where antecedent and consequent are both 1.
test_err = test_df.drop(index=idx_class)
print(test_df.shape, test_err.shape, sep="\n")
```
%% Output
(2348, 39)
(2317, 39)
%% Cell type:code id: tags:
```
# Rule check on the held-out rows: srv_serror_rate -> class.
# Labels of the rows where the antecedent item is 0.
idx_2 = test.index[test['srv_serror_rate'] == 0]
# Rows on which the rule fires.
test_df = test.drop(index=idx_2)
# Of those, labels of the rows whose class item is 0.
idx_class = test_df.index[test_df['class'] == 0]
# Rows where antecedent and consequent are both 1.
test_err = test_df.drop(index=idx_class)
print(test_df.shape, test_err.shape, sep="\n")
```
%% Output
(2403, 39)
(2320, 39)
%% Cell type:code id: tags:
```
# Rule check on the held-out rows: serror_rate -> class.
# Labels of the rows where the antecedent item is 0.
idx_3 = test.index[test['serror_rate'] == 0]
# Rows on which the rule fires.
test_df = test.drop(index=idx_3)
# Of those, labels of the rows whose class item is 0.
idx_class = test_df.index[test_df['class'] == 0]
# Rows where antecedent and consequent are both 1.
test_err = test_df.drop(index=idx_class)
print(test_df.shape, test_err.shape, sep="\n")
```
%% Output
(2412, 39)
(2323, 39)
%% Cell type:code id: tags:
```
# Rule check on the held-out rows: dst_host_serror_rate -> class.
# Labels of the rows where the antecedent item is 0.
idx_4 = test.index[test['dst_host_serror_rate'] == 0]
# Rows on which the rule fires.
test_df = test.drop(index=idx_4)
# Of those, labels of the rows whose class item is 0.
idx_class = test_df.index[test_df['class'] == 0]
# Rows where antecedent and consequent are both 1.
test_err = test_df.drop(index=idx_class)
print(test_df.shape, test_err.shape, sep="\n")
```
%% Output
(2398, 39)
(2316, 39)
%% Cell type:code id: tags:
```
# Combined check: keep the held-out rows where at least one of the four
# antecedent items is 1, i.e. drop the rows where all four are 0.
idx_a = test.index[
    test[['dst_host_srv_serror_rate', 'srv_serror_rate', 'serror_rate', 'dst_host_serror_rate']]
    .eq(0)
    .all(axis=1)
]
test_df = test.drop(index=idx_a)
# Of the kept rows, labels of those whose class item is 0.
idx_class = test_df.index[test_df['class'] == 0]
test_err = test_df.drop(index=idx_class)
print(test_df.shape, test_err.shape, sep="\n")
```
%% Output
(2499, 39)
(2338, 39)
%% Cell type:code id: tags:
```
# Combined check: keep the held-out rows where all four antecedent items
# are 1, i.e. drop the rows where any of them is 0.
idx_b = test.index[
    test[['dst_host_srv_serror_rate', 'srv_serror_rate', 'serror_rate', 'dst_host_serror_rate']]
    .eq(0)
    .any(axis=1)
]
test_df = test.drop(index=idx_b)
# Of the kept rows, labels of those whose class item is 0.
idx_class = test_df.index[test_df['class'] == 0]
test_err = test_df.drop(index=idx_class)
print(test_df.shape, test_err.shape, sep="\n")
```
%% Output
(2301, 39)
(2300, 39)
%% Cell type:markdown id: tags:
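*Apriori test accuracy = 97.03%* — the mean, over the four single-antecedent rules above, of the share of matching test rows whose `class` item is 1 (2317/2348, 2320/2403, 2323/2412, 2316/2398).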
%% Cell type:code id: tags:
```
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment