diff --git a/[DM]_Naive_Bayes.ipynb b/[DM]_Naive_Bayes.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..efef969a2a863e6b25f6a8b176b9529d9512028e --- /dev/null +++ b/[DM]_Naive_Bayes.ipynb @@ -0,0 +1,297 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "[DM] Naive_Bayes.ipynb", + "provenance": [], + "collapsed_sections": [], + "authorship_tag": "ABX9TyNQ0EbVS6CTLuq4m8RIc+fn", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "<a href=\"https://colab.research.google.com/github/lani009/IDS-DataMining/blob/main/%5BDM%5D_Naive_Bayes.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "p_S1iryH1NBB" + }, + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import time" + ], + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "S3PZhNLC1daY", + "outputId": "0a4f496e-8485-441d-eff6-d7bc67c3886e" + }, + "source": [ + "data = pd.read_csv('DM_data.csv')\n", + "data.info()" + ], + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 25192 entries, 0 to 25191\n", + "Data columns (total 40 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 duration 25192 non-null int64 \n", + " 1 protocol_type 25192 non-null int64 \n", + " 2 service 25192 non-null int64 \n", + " 3 flag 25192 non-null int64 \n", + " 4 
src_bytes 25192 non-null int64 \n", + " 5 dst_bytes 25192 non-null int64 \n", + " 6 land 25192 non-null int64 \n", + " 7 wrong_fragment 25192 non-null int64 \n", + " 8 hot 25192 non-null int64 \n", + " 9 num_failed_logins 25192 non-null int64 \n", + " 10 logged_in 25192 non-null int64 \n", + " 11 num_compromised 25192 non-null int64 \n", + " 12 root_shell 25192 non-null int64 \n", + " 13 su_attempted 25192 non-null int64 \n", + " 14 num_root 25192 non-null int64 \n", + " 15 num_file_creations 25192 non-null int64 \n", + " 16 num_shells 25192 non-null int64 \n", + " 17 num_access_files 25192 non-null int64 \n", + " 18 is_guest_login 25192 non-null int64 \n", + " 19 count 25192 non-null int64 \n", + " 20 srv_count 25192 non-null int64 \n", + " 21 serror_rate 25192 non-null float64\n", + " 22 srv_serror_rate 25192 non-null float64\n", + " 23 rerror_rate 25192 non-null float64\n", + " 24 srv_rerror_rate 25192 non-null float64\n", + " 25 same_srv_rate 25192 non-null float64\n", + " 26 diff_srv_rate 25192 non-null float64\n", + " 27 srv_diff_host_rate 25192 non-null float64\n", + " 28 dst_host_count 25192 non-null int64 \n", + " 29 dst_host_srv_count 25192 non-null int64 \n", + " 30 dst_host_same_srv_rate 25192 non-null float64\n", + " 31 dst_host_diff_srv_rate 25192 non-null float64\n", + " 32 dst_host_same_src_port_rate 25192 non-null float64\n", + " 33 dst_host_srv_diff_host_rate 25192 non-null float64\n", + " 34 dst_host_serror_rate 25192 non-null float64\n", + " 35 dst_host_srv_serror_rate 25192 non-null float64\n", + " 36 dst_host_rerror_rate 25192 non-null float64\n", + " 37 dst_host_srv_rerror_rate 25192 non-null float64\n", + " 38 class 25192 non-null int64 \n", + " 39 index_num 25192 non-null int64 \n", + "dtypes: float64(15), int64(25)\n", + "memory usage: 7.7 MB\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "vzo6lf_G3QFN" + }, + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing 
import MinMaxScaler" + ], + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ioY_BhsQ3Suc" + }, + "source": [ + "data_y = data[\"class\"]\n", + "data_X = data.drop(columns = [\"class\",\"index_num\"])" + ], + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Aoz6AkZa3_rU" + }, + "source": [ + "X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_y, test_size=0.33, random_state=42)  # split BEFORE scaling; NOTE(review): re-run to refresh the recorded outputs\n", + "sc = MinMaxScaler().fit(X_train)  # fit on the training rows only so no test-set min/max leaks into the scaling" + ], + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LffojJ-C1tEY", + "outputId": "95782163-850d-4477-fae8-726c620762c9" + }, + "source": [ + "X_train, X_test = sc.transform(X_train), sc.transform(X_test)  # both splits are scaled with the training-set min/max\n", + "print(X_train.shape, X_test.shape)\n", + "print(Y_train.shape, Y_test.shape)" + ], + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(16878, 38) (8314, 38)\n", + "(16878,) (8314,)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R1X0zCyN4qNT" + }, + "source": [ + "## **Naive Bayes**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xupFriQx2n6T" + }, + "source": [ + "from sklearn.naive_bayes import GaussianNB" + ], + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "bTXaZ-jf4Slk" + }, + "source": [ + "nb = GaussianNB()" + ], + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "N8lbZQaE4UD8", + "outputId": "e45bf357-97fe-4390-c610-60066780e685" + }, + "source": [ + "start_time = time.perf_counter()  # monotonic, high-resolution clock suited to short intervals\n", + "nb.fit(X_train, Y_train.values.ravel())\n", + "end_time = time.perf_counter()\n", + "print(\"Training time: \",end_time-start_time)" + ], + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Training time: 
0.012809514999389648\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Kf7YGTRd4WyR", + "outputId": "668668e8-3349-4ba4-a69a-40a6d3aa2a19" + }, + "source": [ + "start_time = time.perf_counter()  # monotonic, high-resolution clock suited to short intervals\n", + "Y_test_pred = nb.predict(X_test)\n", + "end_time = time.perf_counter()\n", + "print(\"Testing time: \",end_time-start_time)" + ], + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Testing time: 0.012314796447753906\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Rr1BkXsS4bce", + "outputId": "96ee68e9-f9e2-410e-970b-e842eb347146" + }, + "source": [ + "print(\"Train score is:\", nb.score(X_train, Y_train))\n", + "print(\"Test score is:\",nb.score(X_test,Y_test))" + ], + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Train score is: 0.8958407394241024\n", + "Test score is: 0.9030550878037046\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IMUmcZRA4mp1" + }, + "source": [ + "NB's accuracy on the held-out test set = 90.31%" + ] + } + ] +} \ No newline at end of file