# %% [markdown]
# # Connect Google Drive and load the data

# %%
# All imports for the data-preparation stage, in one place.
import ast
import os
import re

import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# %%
# Configuration: every value a reader may want to tune.
SEED = 42                      # makes the shuffle and the train/test split repeatable
TEST_SIZE = 0.2                # fraction of the labelled texts held out for testing
PARAPHRASES_PER_TEXT = 1       # how many ChatGPT paraphrases to keep per human text
DRIVE_MOUNT_POINT = '/content/drive'
# Absolute form of the original relative `cd drive/MyDrive/'Colab Notebooks'/데이터마이닝/`.
# NOTE(review): assumes the notebook starts in /content (the parent of the mount point) — confirm.
PROJECT_DIR = os.path.join(DRIVE_MOUNT_POINT, 'MyDrive', 'Colab Notebooks', '데이터마이닝')
DATA_FILE = 'chatgpt_paraphrases.csv'

# %%
# Mount the user's own Google Drive and move into the project folder.
# os.chdir with an absolute path is idempotent, so re-running this cell is safe
# (a relative `cd` fails the second time because the working directory has moved).
drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)

# %%
# Load the raw data.
raw_data = pd.read_csv(DATA_FILE)

# %%
raw_data.dtypes

# %%
raw_data.head()

# %% [markdown]
# # Data preprocessing and train/test split

# %%
def parse_paraphrases(raw):
    """Convert one cell of the ``paraphrases`` column into a list of strings.

    The cell is expected to hold the text form of a Python list of quoted
    strings (the original code stripped the outer brackets and the quotes by
    hand).  Splitting on ``', '`` breaks every paraphrase that itself contains
    a comma followed by a space, so the value is parsed as a Python literal
    instead.

    Parameters
    ----------
    raw : object
        Cell value; anything that is not a ``str`` yields an empty list.

    Returns
    -------
    list of str
        The paraphrases in their stored order.
    """
    if not isinstance(raw, str):
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: fall back to the original best-effort
        # parsing so such rows are still handled the way they used to be.
        return [item[1:-1] for item in raw[1:-1].split(', ')]
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    return [str(parsed)]


def build_labelled_frame(raw_frame, paraphrases_per_text=PARAPHRASES_PER_TEXT):
    """Collect the necessary data: one row per distinct text with its author label.

    For every row of ``raw_frame`` the first ``paraphrases_per_text``
    paraphrases are labelled ``'chatgpt'`` and the ``text`` value is labelled
    ``"human"``.  A dict keyed by text is used, so a text that appears more
    than once keeps a single row carrying the label written last.

    Parameters
    ----------
    raw_frame : pandas.DataFrame
        Must provide the columns ``text`` and ``paraphrases``.
    paraphrases_per_text : int
        Number of leading paraphrases to keep per row.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``category``, in insertion order (not shuffled).
    """
    category = {}
    # Iterate over the two columns directly instead of building a Series per
    # row with .iloc, which is far slower and gives the same values.
    for human_text, raw_paraphrases in zip(raw_frame['text'], raw_frame['paraphrases']):
        for paraphrase in parse_paraphrases(raw_paraphrases)[:paraphrases_per_text]:
            category[paraphrase] = 'chatgpt'
        category[human_text] = "human"
    # Convert the dictionary into a DataFrame.
    return pd.DataFrame(list(category.items()), columns=["text", "category"])

# %%
# Build the labelled frame and shuffle it reproducibly.
data = build_labelled_frame(raw_data).sample(frac=1, random_state=SEED)
data.head()

# %%
data["category"].value_counts()

# %%
# Train/test split.
X = data['text']
y = data['category']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# Vectorize with TF-IDF.  The vocabulary is learned from the training texts
# only; the test texts are merely transformed with it.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# %% [markdown]
# # Model selection

# %% [markdown]
# Candidate models
#
# KNN (K-Nearest Neighbors),
# SVC (Support Vector Classifier, i.e. a support vector machine),
# RFC (Random Forest Classifier),
# DTC (Decision Tree Classifier)
#
# are the models we plan to use.
# %% [markdown]
# ## KNN (K Nearest Neighbor)

# %%
# All imports for the model-selection and evaluation stage, in one place.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, confusion_matrix, roc_curve
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# %%
# Configuration for this stage.
SEED = 42                                   # seed for the stochastic models and the K-fold shuffle
CV_FOLDS = 5                                # folds used inside every grid search
CLASS_LABELS = ["chatgpt", "human"]         # label values written by the preprocessing step
CLASS_DISPLAY_NAMES = ["ChatGPT", "Human"]  # axis captions, same order as CLASS_LABELS
POSITIVE_LABEL = 'human'                    # class treated as "positive" for the ROC curve

# Hyperparameters of the finally selected random forest (single source of truth).
# NOTE(review): the original notebook used min_samples_leaf=4 in the final grid
# search but min_samples_leaf=1 in the K-fold cell; 4 is kept here — confirm
# which value was actually selected.
FINAL_RFC_PARAMS = {
    'n_estimators': 200,
    'max_depth': None,
    'min_samples_split': 5,
    'min_samples_leaf': 4,
}

# Metrics computed on the validation folds for every hyperparameter combination.
CV_SCORING = {
    "accuracy": "accuracy",
    "precision": "precision_weighted",
    "recall": "recall_weighted",
    "f1": "f1_weighted",
    "mcc": metrics.make_scorer(metrics.matthews_corrcoef),
}

# (printed caption, key in CV_SCORING) in the order they are reported.
CV_REPORT_ROWS = [
    (" Accuracy:", "accuracy"),
    (" Precision:", "precision"),
    (" Recall:", "recall"),
    (" F1 Score:", "f1"),
    (" Matthews Correlation Coefficient:", "mcc"),
]

# %%
def report_test_metrics(model, X_eval, y_eval):
    """Print and return accuracy, weighted precision/recall/F1 and MCC of ``model``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``.
    X_eval : feature matrix to predict on.
    y_eval : true labels for ``X_eval``.

    Returns
    -------
    dict
        Metric caption -> value, in the order printed.
    """
    y_hat = model.predict(X_eval)
    scores = {
        "Accuracy": metrics.accuracy_score(y_eval, y_hat),
        "Precision": metrics.precision_score(y_eval, y_hat, average='weighted'),
        "Recall": metrics.recall_score(y_eval, y_hat, average='weighted'),
        "F1 Score": metrics.f1_score(y_eval, y_hat, average='weighted'),
        "Matthews Correlation Coefficient": metrics.matthews_corrcoef(y_eval, y_hat),
    }
    print("\nEvaluation Metrics:")
    for caption, value in scores.items():
        print(f"{caption}:", value)
    return scores


def tune_and_evaluate(estimator, param_grid, X_fit, y_fit, X_eval, y_eval,
                      cv=CV_FOLDS, n_jobs=-1):
    """Grid-search ``estimator`` and evaluate the winning model once on held-out data.

    Every hyperparameter combination is scored with cross-validation on
    ``(X_fit, y_fit)`` only.  The original cells refitted each combination on
    the full training set and scored it on the test set, which let test data
    steer model selection and doubled the training cost; here the held-out
    data is used a single time, for the estimator selected by mean CV accuracy.

    Parameters
    ----------
    estimator : unfitted scikit-learn classifier (GridSearchCV works on clones of it).
    param_grid : dict mapping parameter names to lists of candidate values.
    X_fit, y_fit : training features / labels used for the search and the refit.
    X_eval, y_eval : held-out features / labels for the final report.
    cv : int or CV splitter passed to GridSearchCV.
    n_jobs : number of parallel jobs passed to GridSearchCV.

    Returns
    -------
    GridSearchCV
        The fitted search (``best_estimator_``, ``best_params_``, ``cv_results_``).
    """
    search = GridSearchCV(estimator, param_grid, cv=cv, scoring=CV_SCORING,
                          refit="accuracy", n_jobs=n_jobs)
    search.fit(X_fit, y_fit)

    # Report the mean validation-fold score of every combination.
    print("Cross-validated scores (mean over folds) per hyperparameter combination:")
    results = search.cv_results_
    for position, params in enumerate(results['params']):
        print(f"Hyperparameters: {params}")
        for caption, key in CV_REPORT_ROWS:
            print(caption, results[f"mean_test_{key}"][position])
        print("-----------")

    print("\nBest Hyperparameters:", search.best_params_)

    # Held-out evaluation of the selected model only.
    report_test_metrics(search.best_estimator_, X_eval, y_eval)
    return search

# %%
# KNN, coarse grid.
knn = KNeighborsClassifier()
grid_search = tune_and_evaluate(
    knn,
    {'n_neighbors': [1, 3, 5, 7, 9, 11], 'weights': ['uniform', 'distance']},
    X_train_tfidf, y_train, X_test_tfidf, y_test,
)

# %% [markdown]
# Observation recorded by the author: the MCC value grows as n_neighbors grows,
# so the next cell searches a much wider range.

# %%
# KNN, wide grid.
knn = KNeighborsClassifier()
grid_search = tune_and_evaluate(
    knn,
    {'n_neighbors': list(range(1, 101)), 'weights': ['uniform', 'distance']},
    X_train_tfidf, y_train, X_test_tfidf, y_test,
)
best_knn = grid_search.best_estimator_

# %% [markdown]
# Result recorded from an earlier run (numbers can differ between runs):
#
# Best Hyperparameters: {'n_neighbors': 43, 'weights': 'distance'}
#
# Evaluation Metrics:
# Accuracy: 0.72
# Precision: 0.7218863015823609
# Recall: 0.72
# F1 Score: 0.7190533965955703
# Matthews Correlation Coefficient: 0.4413253711772089

# %% [markdown]
# ## SVC (Support Vector Classifier)

# %%
svc = SVC()
grid_search = tune_and_evaluate(
    svc,
    {'C': [0.01, 0.1, 1, 10, 100, 1000], 'kernel': ['linear', 'rbf']},
    X_train_tfidf, y_train, X_test_tfidf, y_test,
)
best_svc = grid_search.best_estimator_

# %% [markdown]
# Result recorded from an earlier run (numbers can differ between runs):
#
# Best Hyperparameters: {'C': 10, 'kernel': 'rbf'}
#
# Evaluation Metrics:
# Accuracy: 0.7625
# Precision: 0.7636107310458006
# Recall: 0.7625
# F1 Score: 0.7619170211585666
# Matthews Correlation Coefficient: 0.5253533759188264

# %% [markdown]
# ## RFC (Random Forest Classifier)

# %%
# random_state makes the forest (bootstrap samples, feature subsets) repeatable.
grid_search = tune_and_evaluate(
    RandomForestClassifier(random_state=SEED),
    {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    X_train_tfidf, y_train, X_test_tfidf, y_test,
)
best_rfc = grid_search.best_estimator_

# %% [markdown]
# ## DTC (Decision Tree Classifier)

# %%
dtc = DecisionTreeClassifier(random_state=SEED)
grid_search = tune_and_evaluate(
    dtc,
    {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    X_train_tfidf, y_train, X_test_tfidf, y_test,
)
best_dtc = grid_search.best_estimator_

# %% [markdown]
# # Cross-validation of the finally selected model

# %%
# Fit the selected random forest.  A one-point grid keeps the same report
# format as the searches above; best_estimator_ is refitted on the full
# training set with exactly FINAL_RFC_PARAMS.
final_grid = {name: [value] for name, value in FINAL_RFC_PARAMS.items()}
grid_search = tune_and_evaluate(
    RandomForestClassifier(random_state=SEED),
    final_grid,
    X_train_tfidf, y_train, X_test_tfidf, y_test,
)
best_rfc = grid_search.best_estimator_

# %%
# Shuffled K-fold cross-validation of the selected model.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=SEED)

# cross_val_score fits clones, so best_rfc itself stays fitted on the full training set.
cv_results = cross_val_score(best_rfc, X_train_tfidf, y_train, cv=kf, scoring='accuracy')

# Accuracy of each fold, then the average over all folds.
for fold_number, fold_accuracy in enumerate(cv_results, start=1):
    print(f"Fold {fold_number} Accuracy: {fold_accuracy:.3f}")
print(f"\nAverage Accuracy across {n_folds} Folds: {cv_results.mean():.3f}")

# %%
# `rfc` is the model used by every later cell.  The original notebook called
# `rfc.fit(...)` on the default-parameter RandomForestClassifier() that had
# been handed to GridSearchCV; the search tunes clones, so that object never
# carried the selected hyperparameters.  Bind the name to the tuned, fitted
# model instead.
rfc = best_rfc

# %% [markdown]
# # Results: confusion matrix & ROC

# %%
y_test.value_counts()

# %%
y_pred = rfc.predict(X_test_tfidf)
# Pass the label order explicitly so rows/columns provably match the captions.
cm = confusion_matrix(y_test, y_pred, labels=CLASS_LABELS)
print(cm)

df_cm = pd.DataFrame(cm, index=CLASS_DISPLAY_NAMES, columns=CLASS_DISPLAY_NAMES)
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(df_cm, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion Matrix")
plt.show()

# %%
# Take the probability column that belongs to the positive class instead of
# assuming it is column 1.
positive_column = list(rfc.classes_).index(POSITIVE_LABEL)
y_prob = rfc.predict_proba(X_test_tfidf)[:, positive_column]

fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label=POSITIVE_LABEL)

# Area under the ROC curve.
roc_auc = auc(fpr, tpr)

# Plot the ROC curve against the chance diagonal.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()
# %% [markdown]
# # Try it out

# %%
import numpy as np


def predict_text_category(model, text, text_vectorizer=None):
    """Predict the author category of a single text.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    text : str
        The raw text to classify.
    text_vectorizer : fitted vectorizer exposing ``transform``, optional
        Defaults to the notebook-level ``vectorizer``.

    Returns
    -------
    The class label with the highest predicted probability.
    """
    if text_vectorizer is None:
        text_vectorizer = vectorizer  # notebook-level vectorizer, defined in an earlier cell
    text_vectorized = text_vectorizer.transform([text])

    # One input text -> one row of class probabilities.
    class_probabilities = model.predict_proba(text_vectorized)[0]

    # predict_proba columns follow model.classes_, so read the labels from the
    # model itself; fall back to the labels of the notebook-level y_train only
    # when the model does not expose classes_.
    class_labels = getattr(model, "classes_", None)
    if class_labels is None:
        class_labels = np.unique(y_train)

    return class_labels[int(np.argmax(class_probabilities))]

# %%
# Sample texts to classify, reproduced exactly as the author entered them.
# NOTE(review): the first sample contains run-together words ("whilehis",
# "pokesat", "atSomething"), probably lost line breaks — confirm whether the
# input should be repaired before drawing conclusions from its prediction.
SAMPLE_TEXTS = [
    "Waves pound across a stone jetty. A MAN sits fishing whilehis young son, BRANDO strolls toward the open sea. He pokesat rocks and seaweed with a fishing pole. He glances down atSomething wedged between the rocks beneath his feet. He pokesat it.",
    "Arthur brushes past Mal, shaking his head. She nears Cobb. Looks out at the DROP.",
    "A script is a list of programmatically-written instructions that can be carried out on command.",
    "A controller is an individual who has responsibility for all accounting-related activities, including high-level accounting, managerial accounting, and finance activities, within a company.",
]

# %%
for text_to_predict in SAMPLE_TEXTS:
    predicted_category = predict_text_category(rfc, text_to_predict)
    print("Predicted Category:", predicted_category)