Skip to content
Snippets Groups Projects
Commit 9d436222 authored by 윤성혁's avatar 윤성혁
Browse files

add

parent 35fdd65c
Branches
No related tags found
No related merge requests found
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler from imblearn.over_sampling import RandomOverSampler
#데이터 로드 #데이터 로드
por = pd.read_csv("./student-por.csv") por = pd.read_csv("./student-por.csv")
math = pd.read_csv("./student-mat.csv") math = pd.read_csv("./student-mat.csv")
data = pd.concat([por, math], ignore_index=True) data = pd.concat([por, math], ignore_index=True)
ros = RandomOverSampler(random_state=0,sampling_strategy='auto') ros = RandomOverSampler(random_state=0,sampling_strategy='auto')
X, Y = ros.fit_resample(data.drop(['Dalc','Walc'],axis=1),data['Walc']) X, Y = ros.fit_resample(data.drop(['Dalc','Walc'],axis=1),data['Walc'])
data = pd.concat([X,Y],axis=1) data = pd.concat([X,Y],axis=1)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#Null값이 없는 것을 확인, G1 G2 G3는 Grade로 통합 #Null값이 없는 것을 확인, G1 G2 G3는 Grade로 통합
data["Grade"] = data['G1']+data['G2']+data['G3'] data["Grade"] = data['G1']+data['G2']+data['G3']
data = data.drop(columns=['G1','G2','G3']) data = data.drop(columns=['G1','G2','G3'])
print(data.info()) print(data.info())
data.shape data.shape
``` ```
%% Output %% Output
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>
RangeIndex: 1990 entries, 0 to 1989 RangeIndex: 1990 entries, 0 to 1989
Data columns (total 30 columns): Data columns (total 30 columns):
# Column Non-Null Count Dtype # Column Non-Null Count Dtype
--- ------ -------------- ----- --- ------ -------------- -----
0 school 1990 non-null object 0 school 1990 non-null object
1 sex 1990 non-null object 1 sex 1990 non-null object
2 age 1990 non-null int64 2 age 1990 non-null int64
3 address 1990 non-null object 3 address 1990 non-null object
4 famsize 1990 non-null object 4 famsize 1990 non-null object
5 Pstatus 1990 non-null object 5 Pstatus 1990 non-null object
6 Medu 1990 non-null int64 6 Medu 1990 non-null int64
7 Fedu 1990 non-null int64 7 Fedu 1990 non-null int64
8 Mjob 1990 non-null object 8 Mjob 1990 non-null object
9 Fjob 1990 non-null object 9 Fjob 1990 non-null object
10 reason 1990 non-null object 10 reason 1990 non-null object
11 guardian 1990 non-null object 11 guardian 1990 non-null object
12 traveltime 1990 non-null int64 12 traveltime 1990 non-null int64
13 studytime 1990 non-null int64 13 studytime 1990 non-null int64
14 failures 1990 non-null int64 14 failures 1990 non-null int64
15 schoolsup 1990 non-null object 15 schoolsup 1990 non-null object
16 famsup 1990 non-null object 16 famsup 1990 non-null object
17 paid 1990 non-null object 17 paid 1990 non-null object
18 activities 1990 non-null object 18 activities 1990 non-null object
19 nursery 1990 non-null object 19 nursery 1990 non-null object
20 higher 1990 non-null object 20 higher 1990 non-null object
21 internet 1990 non-null object 21 internet 1990 non-null object
22 romantic 1990 non-null object 22 romantic 1990 non-null object
23 famrel 1990 non-null int64 23 famrel 1990 non-null int64
24 freetime 1990 non-null int64 24 freetime 1990 non-null int64
25 goout 1990 non-null int64 25 goout 1990 non-null int64
26 health 1990 non-null int64 26 health 1990 non-null int64
27 absences 1990 non-null int64 27 absences 1990 non-null int64
28 Walc 1990 non-null int64 28 Walc 1990 non-null int64
29 Grade 1990 non-null int64 29 Grade 1990 non-null int64
dtypes: int64(13), object(17) dtypes: int64(13), object(17)
memory usage: 466.5+ KB memory usage: 466.5+ KB
None None
(1990, 30) (1990, 30)
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#int형 data의 EDA를 보기 위해 numbers에 저장, Outlier가 없는 것을 확인 #int형 data의 EDA를 보기 위해 numbers에 저장, Outlier가 없는 것을 확인
numbers = data.select_dtypes('int64').columns numbers = data.select_dtypes('int64').columns
numbers = data[numbers] numbers = data[numbers]
numbers.hist(figsize=(18,18), edgecolor='white') numbers.hist(figsize=(18,18), edgecolor='white')
plt.show() plt.show()
display(numbers.describe()) display(numbers.describe())
``` ```
%% Output %% Output
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#각 변수간의 상관관계 분석 (숫자형만) #각 변수간의 상관관계 분석 (숫자형만)
fig, ax = plt.subplots(figsize=(12, 12)) fig, ax = plt.subplots(figsize=(12, 12))
#corr()로 상관관계 계산, vmin-vmax로 최대 최소값 지정, cmap으로 색상 결정, #corr()로 상관관계 계산, vmin-vmax로 최대 최소값 지정, cmap으로 색상 결정,
#annot로 숫자 표시 여부 결정 #annot로 숫자 표시 여부 결정
sns.heatmap(numbers.corr(), vmin=-1, vmax=1, sns.heatmap(numbers.corr(), vmin=-1, vmax=1,
cmap='RdYlBu_r', annot=True) cmap='RdYlBu_r', annot=True)
plt.show() plt.show()
``` ```
%% Output %% Output
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#명목형 변수를 One-Hot encoding으로 정수형으로 바꿔줌 #명목형 변수를 One-Hot encoding으로 정수형으로 바꿔줌
data_dummies = pd.get_dummies(data) data_dummies = pd.get_dummies(data)
data_dummies.head(5) data_dummies.head(5)
``` ```
%% Output %% Output
age Medu Fedu traveltime studytime failures famrel freetime goout \ age Medu Fedu traveltime studytime failures famrel freetime goout \
0 18 4 4 2 2 0 4 3 4 0 18 4 4 2 2 0 4 3 4
1 17 1 1 1 2 0 5 3 3 1 17 1 1 1 2 0 5 3 3
2 15 1 1 1 2 0 4 3 2 2 15 1 1 1 2 0 4 3 2
3 15 4 2 1 3 0 3 2 2 3 15 4 2 1 3 0 3 2 2
4 16 3 3 1 2 0 4 3 2 4 16 3 3 1 2 0 4 3 2
health ... activities_no activities_yes nursery_no nursery_yes \ health ... activities_no activities_yes nursery_no nursery_yes \
0 3 ... 1 0 0 1 0 3 ... 1 0 0 1
1 3 ... 1 0 1 0 1 3 ... 1 0 1 0
2 3 ... 1 0 0 1 2 3 ... 1 0 0 1
3 5 ... 0 1 0 1 3 5 ... 0 1 0 1
4 5 ... 1 0 0 1 4 5 ... 1 0 0 1
higher_no higher_yes internet_no internet_yes romantic_no romantic_yes higher_no higher_yes internet_no internet_yes romantic_no romantic_yes
0 0 1 1 0 1 0 0 0 1 1 0 1 0
1 0 1 0 1 1 0 1 0 1 0 1 1 0
2 0 1 0 1 1 0 2 0 1 0 1 1 0
3 0 1 0 1 0 1 3 0 1 0 1 0 1
4 0 1 1 0 1 0 4 0 1 1 0 1 0
[5 rows x 56 columns] [5 rows x 56 columns]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#feature, label 분리, y #feature, label 분리, y
X = data_dummies.drop(['Walc'], axis=1) X = data_dummies.drop(['Walc'], axis=1)
y_w = data_dummies['Walc'] y_w = data_dummies['Walc']
# y_d = data_dummies['Dalc'] # y_d = data_dummies['Dalc']
X.head(5) X.head(5)
``` ```
%% Output %% Output
age Medu Fedu traveltime studytime failures famrel freetime goout \ age Medu Fedu traveltime studytime failures famrel freetime goout \
0 18 4 4 2 2 0 4 3 4 0 18 4 4 2 2 0 4 3 4
1 17 1 1 1 2 0 5 3 3 1 17 1 1 1 2 0 5 3 3
2 15 1 1 1 2 0 4 3 2 2 15 1 1 1 2 0 4 3 2
3 15 4 2 1 3 0 3 2 2 3 15 4 2 1 3 0 3 2 2
4 16 3 3 1 2 0 4 3 2 4 16 3 3 1 2 0 4 3 2
health ... activities_no activities_yes nursery_no nursery_yes \ health ... activities_no activities_yes nursery_no nursery_yes \
0 3 ... 1 0 0 1 0 3 ... 1 0 0 1
1 3 ... 1 0 1 0 1 3 ... 1 0 1 0
2 3 ... 1 0 0 1 2 3 ... 1 0 0 1
3 5 ... 0 1 0 1 3 5 ... 0 1 0 1
4 5 ... 1 0 0 1 4 5 ... 1 0 0 1
higher_no higher_yes internet_no internet_yes romantic_no romantic_yes higher_no higher_yes internet_no internet_yes romantic_no romantic_yes
0 0 1 1 0 1 0 0 0 1 1 0 1 0
1 0 1 0 1 1 0 1 0 1 0 1 1 0
2 0 1 0 1 1 0 2 0 1 0 1 1 0
3 0 1 0 1 0 1 3 0 1 0 1 0 1
4 0 1 1 0 1 0 4 0 1 1 0 1 0
[5 rows x 55 columns] [5 rows x 55 columns]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
print(y_w.head(5)) print(y_w.head(5))
# print(y_d.head(5)) # print(y_d.head(5))
``` ```
%% Output %% Output
0 1 0 1
1 1 1 1
2 3 2 3
3 1 3 1
4 2 4 2
Name: Walc, dtype: int64 Name: Walc, dtype: int64
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#Weekend 예측 #Weekend 예측
X_train, X_test, y_train, y_test = train_test_split(X,y_w,test_size=0.3) X_train, X_test, y_train, y_test = train_test_split(X,y_w,test_size=0.3)
print("X_train's shape : ", X_train.shape) print("X_train's shape : ", X_train.shape)
print("X_test's shape : ", X_test.shape) print("X_test's shape : ", X_test.shape)
print("y_train's shape : ", y_train.shape) print("y_train's shape : ", y_train.shape)
print("y_test's shape : ", y_test.shape) print("y_test's shape : ", y_test.shape)
``` ```
%% Output %% Output
X_train's shape : (1393, 55) X_train's shape : (1393, 55)
X_test's shape : (597, 55) X_test's shape : (597, 55)
y_train's shape : (1393,) y_train's shape : (1393,)
y_test's shape : (597,) y_test's shape : (597,)
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from lightgbm import LGBMClassifier from lightgbm import LGBMClassifier
from xgboost import XGBClassifier from xgboost import XGBClassifier
from sklearn import svm from sklearn import svm
from sklearn import tree from sklearn import tree
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingClassifier
import torch import torch
import torch.nn as nn import torch.nn as nn
from sklearn.model_selection import GridSearchCV from sklearn.model_selection import GridSearchCV
para_grid = { para_grid = {
'n_estimators' : [150,200], 'n_estimators' : [150,200],
'max_depth' : [10,15,20], 'max_depth' : [10,15,20],
'gamma' : [0.1,0.5,1] 'gamma' : [0.1,0.5,1]
} }
para_agrid = { para_agrid = {
'n_estimators' : [150,200] 'n_estimators' : [150,200]
} }
para_lgrid = { para_lgrid = {
'max_depth' : [20,200], 'max_depth' : [20,200],
'min_child_weight' : [3,5,10,60], 'min_child_weight' : [3,5,10,60],
'gamma' : [0,8,1] 'gamma' : [0,8,1]
} }
para_cgrid = { para_cgrid = {
'n_estimators' : [150,200], 'n_estimators' : [150,200],
'max_depth' : [20,200], 'max_depth' : [20,200],
'min_samples_split' : [3,5,10], 'min_samples_split' : [3,5,10],
'learning_rate' : [0.1,0.3,0.5,0.7,0.9] 'learning_rate' : [0.1,0.3,0.5,0.7,0.9]
} }
svc = svm.SVC() svc = svm.SVC()
clf = tree.DecisionTreeClassifier() clf = tree.DecisionTreeClassifier()
rf = RandomForestClassifier(max_depth=2, random_state=0) rf = RandomForestClassifier(max_depth=2, random_state=0)
agb = AdaBoostClassifier(random_state=0) agb = AdaBoostClassifier(random_state=0)
xgb = XGBClassifier(eval_metric = 'mlogloss',learning_rate=0.4,subsample=0.7,colsample_bytree=0.5) xgb = XGBClassifier(eval_metric = 'mlogloss',learning_rate=0.4,subsample=0.7,colsample_bytree=0.5)
lgb = LGBMClassifier(learning_rate=0.1,subsample=1) lgb = LGBMClassifier(learning_rate=0.1,subsample=1)
gb = GradientBoostingClassifier(max_features='sqrt',min_samples_leaf=1) gb = GradientBoostingClassifier(max_features='sqrt',min_samples_leaf=1)
grid_s = GridSearchCV(estimator=agb,param_grid=para_agrid,n_jobs=-1,verbose=2) grid_s = GridSearchCV(estimator=agb,param_grid=para_agrid,n_jobs=-1,verbose=2)
grid_s.fit(X_train,y_train) grid_s.fit(X_train,y_train)
print('final params', grid_s.best_params_) print('final params', grid_s.best_params_)
print('best score', grid_s.best_score_) print('best score', grid_s.best_score_)
# gb.fit(X_train,y_train) # gb.fit(X_train,y_train)
model = grid_s.best_estimator_ model = grid_s.best_estimator_
pred = model.predict(X_test) pred = model.predict(X_test)
accuracy = accuracy_score(y_test,pred) accuracy = accuracy_score(y_test,pred)
print(accuracy) print(accuracy)
``` ```
%% Output %% Output
Fitting 5 folds for each of 2 candidates, totalling 10 fits Fitting 5 folds for each of 2 candidates, totalling 10 fits
final params {'n_estimators': 200} final params {'n_estimators': 200}
best score 0.4005595523581135 best score 0.4005595523581135
0.38190954773869346 0.38190954773869346
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import eli5 import eli5
from eli5.sklearn import PermutationImportance from eli5.sklearn import PermutationImportance
from sklearn.inspection import plot_partial_dependence, permutation_importance from sklearn.inspection import plot_partial_dependence, permutation_importance
from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=200, max_depth=20, min_samples_split=3, learning_rate=0.9, model = GradientBoostingClassifier(n_estimators=200, max_depth=20, min_samples_split=3, learning_rate=0.9,
max_features='sqrt',min_samples_leaf=1) max_features='sqrt',min_samples_leaf=1)
model.fit(X_train,y_train) model.fit(X_train,y_train)
# result = permutation_importance(model, scoring = "accuracy", random_state=1).fit(X_test,y_test) # result = permutation_importance(model, scoring = "accuracy", random_state=1).fit(X_test,y_test)
## result = PermutationImportance(model, scoring = "accuracy", random_state=1).fit(X_test,y_test) result = PermutationImportance(model, scoring = "accuracy", random_state=1).fit(X_test,y_test)
# result.importances_mean # result.importances_mean
## eli5.show_weights(result, top = 30, feature_names = X_train.columns.tolist()) eli5.show_weights(result, top = 30, feature_names = X_test.columns.tolist())
# disp = plot_partial_dependence(model, X_train, [1, 2]) # disp = plot_partial_dependence(model, X_train, [1, 2])
disp = plot_partial_dependence(model,
X_train,
features = ['goout','age','studytime'],
target=5,
kind='individual')
# print(disp)
``` ```
%% Output %% Output
<IPython.core.display.HTML object>
%% Cell type:code id: tags:
``` python
"""
disp = plot_partial_dependence(model,
X_train,
features = [('goout','age'),'studytime'],
target=2)
"""
```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
""" """
from sklearn.model_selection import validation_curve from sklearn.model_selection import validation_curve
train_scores, valid_scores = validation_curve( train_scores, valid_scores = validation_curve(
GradientBoostingClassifier(max_features='sqrt',min_samples_split=5,min_samples_leaf=1,learning_rate=0.2), GradientBoostingClassifier(max_features='sqrt',min_samples_split=5,min_samples_leaf=1,learning_rate=0.2),
X_train, y_train, "max_depth", np.arange(10,20,2), cv=5) X_train, y_train, "max_depth", np.arange(10,20,2), cv=5)
""" """
``` ```
%% Output %% Output
'\nfrom sklearn.model_selection import validation_curve\ntrain_scores, valid_scores = validation_curve(\n GradientBoostingClassifier(max_features=\'sqrt\',min_samples_split=5,min_samples_leaf=1,learning_rate=0.2),\n X_train, y_train, "max_depth", np.arange(10,20,2), cv=5)\n' '\nfrom sklearn.model_selection import validation_curve\ntrain_scores, valid_scores = validation_curve(\n GradientBoostingClassifier(max_features=\'sqrt\',min_samples_split=5,min_samples_leaf=1,learning_rate=0.2),\n X_train, y_train, "max_depth", np.arange(10,20,2), cv=5)\n'
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
""" """
from sklearn.inspection import plot_partial_dependence from sklearn.inspection import plot_partial_dependence
plot_partial_dependence(clf, X, features) plot_partial_dependence(clf, X, features)
""" """
``` ```
%% Output %% Output
'\nfrom sklearn.inspection import plot_partial_dependence\nplot_partial_dependence(clf, X, features) \n' '\nfrom sklearn.inspection import plot_partial_dependence\nplot_partial_dependence(clf, X, features) \n'
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment