# Train a decision tree on the training CSV and evaluate it on the test CSV.
# Pipeline: load -> select features -> impute missing values -> one-hot encode
# with DictVectorizer -> fit DecisionTreeClassifier -> predict / score.
train = pd.read_csv(r'C:\Users\56484\Desktop\删了双引号的0,1替换成字母的训练集.csv')
test = pd.read_csv(r'C:\Users\56484\Desktop\测试集.csv')

# Same feature columns for both sets, so they are encoded identically.
feature_columns = ["Parch", "SibSp", "Age", "Sex", "Pclass", "Ticket", "Fare", "Cabin", "Embarked"]
x_train = train[feature_columns].copy()
y_train = train[["Survived"]].copy()
x_test = test[feature_columns].copy()
y_test = test[["Survived"]].copy()

# Impute numeric columns with statistics computed on the TRAINING set only, so
# no information from the test set leaks into preprocessing. Assignment is used
# instead of chained `inplace=True`, which does not reliably modify the frame.
age_mean = x_train["Age"].mean()
x_train["Age"] = x_train["Age"].fillna(age_mean)
x_test["Age"] = x_test["Age"].fillna(age_mean)

# Fare must stay numeric: filling with a string placeholder would turn the
# whole column into categories and make the test features differ from training.
fare_mean = x_train["Fare"].mean()
x_train["Fare"] = x_train["Fare"].fillna(fare_mean)
x_test["Fare"] = x_test["Fare"].fillna(fare_mean)

# Missing categorical values become an explicit empty-string category.
for column in ("Cabin", "Embarked"):
    x_train[column] = x_train[column].fillna("")
    x_test[column] = x_test[column].fillna("")

# Fit the vectorizer on the training records only, then reuse the learned
# vocabulary for the test records. Re-fitting on the test set would produce a
# different number/order of columns and make predict() fail.
transfer = DictVectorizer(sparse=False)
x_train = transfer.fit_transform(x_train.to_dict(orient="records"))
x_test = transfer.transform(x_test.to_dict(orient="records"))

estimator = DecisionTreeClassifier(min_samples_split=20, min_samples_leaf=20)
# Pass a 1-D target; a single-column DataFrame triggers a conversion warning.
estimator.fit(x_train, y_train["Survived"])

print(estimator.predict(x_test))
print(estimator.score(x_test, y_test["Survived"]))