# experiment/DM project_logistic_kmeans.R
#
# Fits logistic-regression models (full, stepwise-reduced, and a hand-picked
# subset of predictors) on the training CSV with `class` as the response,
# evaluates a 70/30 hold-out split, writes the recoded data to disk, and
# finally trains/evaluates an rpart model via caret.

library(dplyr)
library(olsrr)
library(data.table)
library(ROCR)
library(caret)

# Paths ----

train_csv_path <- "C:/Users/luinn27/Downloads/archive/Train_data.csv"
output_csv_path <- "G:/Project/DM_data.csv"

# Helpers ----

# Replace every element of `x` that equals `from[i]` with `to[i]`.
# Elements not listed in `from` (including NA) are returned unchanged.
# All replacements are applied simultaneously, based on the input values.
remap_codes <- function(x, from, to) {
  idx <- match(x, from)
  hit <- !is.na(idx)
  x[hit] <- to[idx[hit]]
  x
}

# Print accuracy, precision, recall, F1, error rate and AUC.
#
# test: true labels (0/1, factor or numeric).
# pred: predicted labels (numeric 0/1).
# Returns NULL invisibly (the value of the last `cat()`).
score <- function(test, pred) {
  # Rows are the true labels, columns the predictions. Levels are pinned so
  # the table stays 2x2 even when only one class is predicted.
  confusion_matrix <- table(
    factor(test, levels = c(0, 1)),
    factor(pred, levels = c(0, 1))
  )

  accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
  # Precision = TP / predicted positives (column 2);
  # Recall    = TP / actual positives    (row 2).
  Precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
  Recall <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
  f1score <- 2 * (Precision * Recall) / (Precision + Recall)
  cat("accuracy: ", accuracy, "\nPrecision: ", Precision,
      "\nRecall: ", Recall, "\n")
  cat("F1-score: ", f1score, "\n")

  error_rate <- 1 - accuracy
  cat("error_rate:", error_rate, "\n")

  # ROCR::prediction() takes (predictions, labels), in that order.
  auc <- performance(prediction(pred, test), measure = "auc")
  cat("AUC: ", auc@y.values[[1]])
}

# Load ----

# stringsAsFactors = FALSE keeps text columns as character on every R
# version, so the string comparisons used for recoding below are valid.
data <- read.csv(train_csv_path, stringsAsFactors = FALSE)
#data <- data.frame(data)

# NOTE(review): `data_origin` was referenced further down but never assigned
# anywhere in this script. Presumably it was meant to be the untouched input,
# so a copy is kept here -- TODO confirm.
data_origin <- data

data$protocol_type <- as.character(data$protocol_type)
data$service <- as.character(data$service)
data$flag <- as.character(data$flag)

# Encode categorical text as numbered labels ----

service_names <- c(
  "ftp_data", "private", "http", "remote_job", "name", "netbios_ns",
  "eco_i", "finger", "domain_u", "uucp_path", "Z39_50", "auth", "bgp",
  "courier", "csnet_ns", "ctf", "daytime", "discard", "domain", "echo",
  "ecr_i", "efs", "exec", "ftp", "gopher", "hostnames", "http_443",
  "http_8001", "imap4", "IRC", "iso_tsap", "klogin", "kshell", "ldap",
  "link", "login", "mtp", "netbios_dgm", "netbios_ssn", "netstat", "nnsp",
  "nntp", "ntp_u", "other", "pm_dump", "pop_2", "pop_3", "printer",
  "red_i", "rje", "shell", "smtp", "sql_net", "ssh", "sunrpc", "supdup",
  "systat", "telnet", "tim_i", "time", "urh_i", "urp_i", "uucp", "vmnet",
  "whois", "X11"
)
flag_names <- c(
  "OTH", "REJ", "RSTOS0", "RSTR", "S0", "S1", "S2", "S3", "SF", "SH", "RSTO"
)

# Each name becomes its zero-based position in the vector, stored as text
# ("0", "1", ...). Names that are not listed are left untouched.
data$class <- remap_codes(data$class, c("normal", "anomaly"), c("0", "1"))
data$protocol_type <- remap_codes(
  data$protocol_type, c("tcp", "udp", "icmp"), c("0", "1", "2")
)
data$service <- remap_codes(
  data$service, service_names, as.character(seq_along(service_names) - 1L)
)

table(data$flag)
data$flag <- remap_codes(
  data$flag, flag_names, as.character(seq_along(flag_names) - 1L)
)

data$service <- as.factor(data$service)
data$flag <- as.factor(data$flag)
data$class <- as.factor(data$class)
#data<-na.omit(data)
table(data$service)

# Full logistic model and stepwise selection ----

md <- glm(data = data, class ~ ., family = binomial)
summary(md)
md_ <- step(md, test = "LRT")
md_

table(data_origin$protocol_type)
table(data_origin$service)
table(data_origin$land)
summary(data_origin$srv_count)

# Collapse service levels ----

newdata <- data
# as.numeric() on a factor returns the internal level index (1, 2, ...), not
# the number printed in the label; every code below refers to that index.
newdata$service <- as.numeric(newdata$service)

# Level indices merged into the single catch-all code 99.
pooled_service_codes <- c(
  3:5, 9, 10, 12:17, 19, 21, 22, 24:42, 44, 45, 47:49, 52:56, 58:64
)
newdata$service[newdata$service %in% pooled_service_codes] <- 99

table(newdata$service)
# Renumber the surviving indices. The original sequence also contained
# 13 -> 7 and 14 -> 8, which could never match because 13 and 14 had already
# been pooled into 99; they are omitted, so codes 7 and 8 remain unused.
newdata$service <- remap_codes(
  newdata$service,
  from = c(6, 7, 8, 11, 18, 20, 23, 43, 46, 50, 51, 57, 65, 66),
  to = c(3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)
)
table(newdata$service)

newdata$service <- as.factor(newdata$service)

#newdata <- subset(newdata, select=-c(num_outbound_cmds, is_host_login, urgent))

# Collapse flag levels ----

table(newdata$flag)
newdata$flag <- as.numeric(newdata$flag)
#newdata$flag[newdata$flag=="RSTO"]<-11
newdata$flag <- as.factor(newdata$flag)

new_md <- glm(data = newdata, class ~ ., family = binomial)
summary(new_md)

# Second factor -> numeric round trip: the values are again level indices,
# this time of the factor built from the numeric codes just above.
newdata$flag <- as.numeric(newdata$flag)
newdata$flag[newdata$flag %in% c(8, 3, 10)] <- 99
table(newdata$flag)
newdata$flag <- remap_codes(
  newdata$flag,
  from = c(4, 5, 6, 7, 9, 11, 99),
  to = c(3, 4, 5, 6, 7, 8, 9)
)
table(newdata$flag)
newdata$flag <- as.factor(newdata$flag)
new_md <- glm(data = newdata, class ~ ., family = binomial)
summary(new_md)

# Reduced model ----

fixed_md <- step(new_md, test = "LRT")
data_fix <- subset(
  newdata,
  select = c(
    num_root, logged_in, dst_host_srv_serror_rate, dst_bytes,
    num_failed_logins, is_guest_login, srv_count, land,
    serror_rate, src_bytes, duration, dst_host_diff_srv_rate, rerror_rate,
    dst_host_srv_diff_host_rate, diff_srv_rate,
    srv_diff_host_rate, su_attempted, same_srv_rate, srv_serror_rate,
    dst_host_rerror_rate, srv_rerror_rate,
    dst_host_same_srv_rate, num_compromised, dst_host_count, count,
    wrong_fragment, dst_host_srv_count,
    dst_host_same_src_port_rate, protocol_type, flag, service, hot, class
  )
)
data_fix <- data.table(data_fix)
md_f <- glm(data = data_fix, class ~ ., family = "binomial")
summary(md_f)

# shapiro.test() stops with an error unless it gets between 3 and 5000
# non-missing values, which would abort the rest of the script.
n_resid <- sum(!is.na(md_f$residuals))
if (n_resid >= 3 && n_resid <= 5000) {
  shapiro.test(md_f$residuals)
} else {
  message(
    "Skipping shapiro.test: ", n_resid,
    " residuals (the test needs between 3 and 5000)."
  )
}

# Hold-out evaluation of the logistic model ----

data_fixed <- subset(
  data_fix,
  select = -c(srv_count, rerror_rate, su_attempted, wrong_fragment)
)

# A temporary row id lets anti_join() pick exactly the rows not sampled
# into the training set.
data_fixed <- data_fixed %>% dplyr::mutate(index_num = row_number())
tr <- sample_frac(data_fixed, size = 0.7)
ts <- anti_join(data_fixed, tr, by = c("index_num"))
tr <- subset(tr, select = -index_num)
ts <- subset(ts, select = -index_num)

model_to_predict <- glm(data = tr, class ~ ., family = binomial)
summary(model_to_predict)
pred_prob <- predict(model_to_predict, newdata = ts, type = "response")
pred <- ifelse(pred_prob < 0.5, 0, 1)
test <- ts$class

# True labels first, predictions second, matching score()'s signature.
score(test, pred)

write.csv(newdata, output_csv_path, row.names = FALSE)


#### K-means

tr_k <- subset(tr, select = -c(class))
# NOTE(review): this clustering result is not used by anything below; the
# evaluation that follows is of the rpart model.
km_fit <- kmeans(tr_k, centers = 2, iter.max = 10000)
#km_fit$centers
modfit <- train(x = tr_k, y = tr$class, method = "rpart")
tree_pred <- predict(modfit, ts, type = "prob")
tree_pred <- ifelse(tree_pred < 0.5, 0, 1)
tree_pred <- data.table(tree_pred)
# The second probability column, thresholded at 0.5, is used as the
# predicted label.
names(tree_pred)[2] <- "conclusion"

score(ts$class, tree_pred$conclusion)