import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.random import RandomState
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
# Load the customer training data; the path is resolved against the current
# working directory.
customer_df = pd.read_csv('train-set.csv')
customer_df.head(5)  # preview; only rendered in an interactive session

# Report the dataset shape, the dtype mix, and the features with missing values.
print(' Shape:', customer_df.shape)
print('\n feature type \n', customer_df.dtypes.value_counts())
isnull_series = customer_df.isnull().sum()
print('\nNull :\n ', isnull_series[isnull_series > 0].sort_values(ascending=False))

# Impute numeric columns with their column mean. numeric_only=True is required
# because the frame still contains string columns: pandas >= 2.0 raises a
# TypeError instead of silently skipping them, and older versions accept the
# flag as well.
customer_df.fillna(customer_df.mean(numeric_only=True), inplace=True)

# Columns that still contain nulls after the numeric imputation; print their
# dtypes so the remaining gaps can be handled separately.
remaining_null_counts = customer_df.isnull().sum()
null_column_count = remaining_null_counts[remaining_null_counts > 0]
print('## Null Type :\n', customer_df.dtypes[null_column_count.index])
# Replace missing values in every object-dtype column with the placeholder 'N'.
for c in customer_df.columns:
    if customer_df[c].dtype != 'object':
        continue
    customer_df[c] = customer_df[c].fillna('N')

# Binary flags: 1 when the cell equals the positive label, 0 for anything else
# (including the 'N' placeholder written above).
for flag_column, positive_label in (
    ('Gender', 'Male'),
    ('Married', 'Yes'),
    ('Graduated', 'Yes'),
):
    customer_df[flag_column] = customer_df[flag_column].eq(positive_label).astype('int64')

# Segment label -> integer code; every label other than A/B/C becomes 4.
segment_codes = {'A': 1, 'B': 2, 'C': 3}
customer_df['Segmentation'] = customer_df['Segmentation'].apply(
    lambda label: segment_codes.get(label, 4)
)

# Keep the target aside, then remove it and the identifier from the features.
y = customer_df['Segmentation']
customer_df.drop(columns=['CustomerID', 'Segmentation'], inplace=True)
# Label-encode the remaining categorical features into integer codes.
# A fresh encoder is fitted per column; `le` ends up holding the last one.
features = ['Profession', 'SpendingScore', 'Category']
for feature in features:
    le = LabelEncoder()
    customer_df[feature] = le.fit_transform(customer_df[feature])

from sklearn.preprocessing import MinMaxScaler

# Rescale every feature with MinMaxScaler's default settings.
# NOTE(review): the scaler is fitted on the full dataset before any
# train/test split, so test rows influence the scaling — confirm this is
# acceptable for the experiment.
scaler = MinMaxScaler()
customer_scaled = scaler.fit_transform(customer_df)
customer_df  # notebook display residue; no effect when run as a script
customer_scaled  # notebook display residue; no effect when run as a script

# Take the column labels from the frame that was scaled rather than from a
# hand-written list, so they always match the actual column order and count.
customer_scaled_df = pd.DataFrame(data=customer_scaled, columns=customer_df.columns)

# All scaled features except the last column.
# NOTE(review): X is not used by the code visible here — confirm it is needed.
X = customer_scaled_df.iloc[:, :-1]
import os
import warnings

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Silence all warnings for the remainder of the run.
warnings.filterwarnings('ignore')

# Baseline: a decision tree trained on the scaled features, with 20% of the
# rows held out for evaluation.
dt_clf = DecisionTreeClassifier(random_state=156)
X_train, X_test, y_train, y_test = train_test_split(
    customer_scaled_df,
    y,
    test_size=0.2,
    random_state=11,
)
dt_clf.fit(X_train, y_train)

# Score the held-out rows and print the accuracy to four decimals.
pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print('准确度: {0:.4f}'.format(accuracy))
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# Hold out 20% of the (unscaled, label-encoded) features for the final test,
# then carve a validation set out of the training part for early stopping.
X_train, X_test, y_train, y_test = train_test_split(
    customer_df, y, test_size=0.2, random_state=156
)
X1_train, X1_val, y1_train, y1_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=156
)

lgbm_wrapper = LGBMClassifier(n_estimators=400)
evals = [(X1_val, y1_val)]

# LightGBM 4.0 removed the `early_stopping_rounds` and `verbose` keyword
# arguments from fit(); the equivalent behaviour is requested via callbacks:
# stop after 100 rounds without improvement and log the metric every round.
lgbm_wrapper.fit(
    X1_train,
    y1_train,
    eval_metric="logloss",
    eval_set=evals,
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=1)],
)

# Evaluate on the untouched test split.
lgbm_preds = lgbm_wrapper.predict(X_test)
accuracy = accuracy_score(y_test, lgbm_preds)
print('准确度: {0:.4f}'.format(accuracy))
你好,我是有问必答小助手,非常抱歉,本次您提出的有问必答问题,技术专家团超时未为您做出解答
本次提问扣除的有问必答次数,已经为您补发到账户,我们后续会持续优化,扩大我们的服务范围,为您带来更好的服务。