```python
# Exploratory analysis and stratified split of the California housing data.
# BUG FIX: the first line was a truncated "import pandas as pd".
import pandas as pd

file_name = '加州房价数据集.csv'
# Pass the path directly: pd.read_csv(open(file_name)) leaked a file handle.
housing = pd.read_csv(file_name)
housing.info()

# Fill missing bedroom counts with the column median (textbook "option 3").
# Assignment instead of chained fillna(inplace=True), which is deprecated.
median = housing["卧室总数"].median()
housing["卧室总数"] = housing["卧室总数"].fillna(median)
housing.info()

counts = housing["离海的距离"].value_counts()
de = housing.describe()

from sklearn.model_selection import train_test_split
# Plain random 80/20 split (a stratified split is built below for comparison).
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

import numpy as np
# Bucket median income into 5 strata so sampling can preserve its distribution.
housing["income_cat"] = pd.cut(housing["收入中位数"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
housing.head()
housing["income_cat"].hist()

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# Sanity check: both splits should preserve the income-category proportions.
strat_train_set["income_cat"].value_counts() / len(strat_train_set)
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

# Remove the helper column once stratification is done (drop returns a copy,
# avoiding SettingWithCopy issues on the .loc-derived frames).
strat_train_set = strat_train_set.drop('income_cat', axis=1)
strat_test_set = strat_test_set.drop('income_cat', axis=1)

housing = strat_train_set.copy()

import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']   # render Chinese labels
plt.rcParams['axes.unicode_minus'] = False

# BUG FIX: corr() on a frame containing the categorical "离海的距离" column
# raises TypeError in pandas >= 2.0; numeric_only keeps the original intent.
corr_matrix = housing.corr(numeric_only=True)
# NOTE(review): the leading space in " 房屋价值中位数" appears to be part of
# the CSV header (used consistently below) — verify against the file.
corr2 = corr_matrix[" 房屋价值中位数"].sort_values(ascending=False)

housing.plot(kind="scatter", x="地理经度", y="地理纬度", alpha=0.4,
             s=housing["人口规模"] / 100, label="population",
             c=" 房屋价值中位数", cmap=plt.get_cmap("jet"), colorbar=True,
             sharex=False)
plt.legend()
plt.show()

# Separate predictors and labels from the (non-stratified) training set;
# drop() returns a copy, so train_set itself is unchanged.
housing = train_set.drop(" 房屋价值中位数", axis=1)
housing_labels = train_set[" 房屋价值中位数"].copy()

# Pie chart of the ocean-proximity distribution.
plt.pie(counts, labels=counts.index, autopct='%3.1f%%')
plt.title('房屋到海距离占比')
def one(df):
    """Append one-hot columns for the last column of *df*.

    The original columns are kept; the dummies are concatenated on the right.
    """
    dummies = pd.get_dummies(df.iloc[:, -1])
    return pd.concat([df, dummies], axis=1)
# One-hot encode the last column of the training features.
df = one(housing)

# BUG FIX: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer from sklearn.impute is the drop-in replacement.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("离海的距离", axis=1)  # drop the non-numeric feature
imputer.fit(housing_num)
num_attribs = list(housing_num)  # column names [col1, col2, ...]
cat_attribs = ["离海的距离"]
```
该回答引用ChatGPT
特征工程:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset (English-header version of the California housing data).
df = pd.read_csv('housing.csv')

# Split features and label.
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

# Fill missing bedroom counts with the median.
# BUG FIX: chained fillna(..., inplace=True) on a column selection is
# deprecated (chained assignment); assign the result back instead.
X['total_bedrooms'] = X['total_bedrooms'].fillna(X['total_bedrooms'].median())

# Standardize all features to zero mean / unit variance.
# NOTE(review): StandardScaler fails if X still holds a non-numeric column
# such as ocean_proximity — encode or drop it first; verify the CSV schema.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
回归模型:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Reserve 20% of the samples as a held-out test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Linear regression baseline (fit returns the estimator itself).
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_mse = mean_squared_error(y_test, lr_pred)
print('Linear Regression MSE:', lr_mse)

# Depth-limited decision tree.
dt = DecisionTreeRegressor(max_depth=5).fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_mse = mean_squared_error(y_test, dt_pred)
print('Decision Tree MSE:', dt_mse)

# Random forest of 100 shallow trees.
rf = RandomForestRegressor(n_estimators=100, max_depth=5).fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
print('Random Forest MSE:', rf_mse)
加州房价数据集是一个经典的回归问题,可以使用回归算法进行研究和预测。以下是使用Python中的Scikit-learn库来实现加州房价预测的示例代码:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the built-in California housing data into a DataFrame.
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    df[data.feature_names], df['target'], test_size=0.3, random_state=42)


def _evaluate(model, name):
    """Fit *model* on the training split, print and return test metrics."""
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    print(f"{name}:\nMSE = {mse:.4f}, R2 score = {r2:.4f}")
    return pred, mse, r2


# Linear regression.
lr = LinearRegression()
y_pred_lr, mse_lr, r2_lr = _evaluate(lr, "Linear Regression")

# Decision tree regression.
dt = DecisionTreeRegressor()
y_pred_dt, mse_dt, r2_dt = _evaluate(dt, "Decision Tree Regression")

# Random forest regression.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
y_pred_rf, mse_rf, r2_rf = _evaluate(rf, "Random Forest Regression")
这个例子中,我们使用Scikit-learn库中的fetch_california_housing函数加载数据集,并使用train_test_split函数将数据集划分为训练集和测试集。然后,我们创建三个回归模型:线性回归模型、决策树回归模型和随机森林回归模型。最后,我们使用测试集进行预测,并使用均方误差(MSE)和R2评分评估每个模型的性能。最终结果表明,随机森林回归模型具有最佳性能。
当然,对于一个实际的项目,通常需要更复杂的数据预处理和特征工程,以及更复杂的模型选择与调参过程。下面给出一个更完整的示例:
import pandas as pd
import numpy as np
# BUG FIX: StratifiedShuffleSplit was used below but never imported.
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     StratifiedShuffleSplit)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the data.
file_name = '加州房价数据集.csv'
housing = pd.read_csv(file_name)

# Fill missing bedroom counts with the column median (assignment instead of
# the deprecated chained fillna(..., inplace=True)).
median = housing['卧室总数'].median()
housing['卧室总数'] = housing['卧室总数'].fillna(median)

# Plain random 80/20 split (kept for comparison with the stratified split).
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Income-category feature used only to stratify the sampling.
housing['income_cat'] = pd.cut(housing['收入中位数'],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

# Stratified 80/20 split on the income category.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# Remove the helper column once the split is done (drop returns copies,
# avoiding SettingWithCopy issues on the .loc-derived frames).
strat_train_set = strat_train_set.drop('income_cat', axis=1)
strat_test_set = strat_test_set.drop('income_cat', axis=1)

# Numeric features: impute medians, then standardize.
# NOTE(review): these English names assume the CSV uses English headers,
# unlike the Chinese headers used above — verify against the actual file.
num_attribs = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
               'total_bedrooms', 'population', 'households', 'median_income']
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical feature: one-hot encode.
cat_attribs = ['ocean_proximity']
cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder()),
])

# Apply both pipelines column-wise; columns not listed (incl. the label)
# are dropped by ColumnTransformer's default remainder='drop'.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
])

# Transform train/test features; fit statistics come from the training set only.
X_train = full_pipeline.fit_transform(strat_train_set)
y_train = strat_train_set['median_house_value']
X_test = full_pipeline.transform(strat_test_set)
y_test = strat_test_set['median_house_value']

# Linear regression model.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Predict house prices on the test set.
y_pred = lin_reg.predict(X_test)

# Evaluate: MSE, RMSE, and R² (R² is computed but not printed, as before).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print('均方误差(MSE):', mse)
print('均方根误差(RMSE):', rmse)
可以使用scikit-learn机器学习库,其中包含许多有用的回归算法。加州房价数据集的加载与准备步骤如下所示:
# Load the California housing dataset.
import pandas as pd
file_name = '加州房价数据集.csv'
# Pass the path directly: pd.read_csv(open(file_name)) leaked a file handle.
housing = pd.read_csv(file_name)

# Handle missing values: fill bedroom counts with the median (option 3).
median = housing["卧室总数"].median()
housing["卧室总数"] = housing["卧室总数"].fillna(median)

# Create an income_cat feature so the test set represents the original
# income distribution in the correct proportions.
import numpy as np
housing["income_cat"] = pd.cut(housing["收入中位数"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

# Stratified train/test split on the income category.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
参考GPT和自己的思路,以下是特征工程和回归算法建模的Python代码:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# BUG FIX: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer from sklearn.impute is the drop-in replacement.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Load the data.
housing = pd.read_csv('加州房价数据集.csv')

# Fill missing bedroom counts with the median (assignment instead of the
# deprecated chained fillna(..., inplace=True)).
median = housing['卧室总数'].median()
housing['卧室总数'] = housing['卧室总数'].fillna(median)

# Median-impute every numeric column (the categorical one is dropped first).
imputer = SimpleImputer(strategy='median')
housing_num = housing.drop('离海的距离', axis=1)
imputer.fit(housing_num)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
# Feature construction follows.
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append derived ratio features to a numeric feature matrix.

    Adds rooms_per_household and population_per_household, plus
    bedrooms_per_room when *add_bedrooms_per_room* is True.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, y=None):
        # Fixed column positions in the numeric matrix — assumes rooms,
        # bedrooms, population, households sit at 3..6; verify vs the CSV.
        rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if not self.add_bedrooms_per_room:
            return np.c_[X, rooms_per_household, population_per_household]
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a fixed list of DataFrame columns as a NumPy array."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Nothing to fit; selection is purely positional by name.
        return self

    def transform(self, X):
        # .values so downstream sklearn steps receive an ndarray.
        return X[self.attribute_names].values
class LabelBinarizerPipelineFriendly(LabelBinarizer):
    """LabelBinarizer whose fit_transform accepts (and ignores) *y*,
    so it can be used as a step inside a Pipeline."""

    def fit_transform(self, X, y=None):
        return super().fit_transform(X)
num_attribs = list(housing_num)  # numeric column names
cat_attribs = ['离海的距离']
cat_encoder = LabelBinarizerPipelineFriendly()

# Numeric pipeline: select -> add ratio features -> impute -> scale.
# BUG FIX: sklearn.preprocessing.Imputer no longer exists; use SimpleImputer.
from sklearn.impute import SimpleImputer
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('attribs_adder', CombinedAttributesAdder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical pipeline: select -> one-hot via the pipeline-friendly binarizer.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', cat_encoder),
])

from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)

# BUG FIX: housing_labels was never defined; derive it from the target column.
# NOTE(review): assumes " 房屋价值中位数" (leading space, as used elsewhere in
# this file) is the label column — verify against the CSV header.
housing_labels = housing[" 房屋价值中位数"].copy()

# Linear regression model.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# Decision tree regression model.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

# Random forest regression model.
# BUG FIX: the original assigned the class object itself (missing
# parentheses) and never fit it.
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
如果对您有帮助,请给与采纳,谢谢。
方案来自 梦想橡皮擦 狂飙组基于 GPT 编写的 “程秘”
特征工程是机器学习中非常重要的一步,它的目的是从原始数据中提取出对预测目标有用的特征,并对特征进行处理、转换和选择。对于房价预测问题,以下是一些可能有用的特征:
房屋的面积和房间数量:通常情况下,房屋的面积和房间数量与房价正相关。
房屋所在的地理位置:房屋的地理位置通常是影响房价的一个重要因素。可以考虑使用经纬度、邮编等信息。
房屋的建造年份和状况:房屋的新旧和维护情况通常是影响房价的因素之一。
社区的信息:社区的设施、教育水平、治安等信息也可能对房价有影响。
在选择特征时,需要考虑特征之间的相关性,尽量选择独立的特征。
回归算法是一种常用的预测模型,可以对连续性变量进行预测。可能适用于房价预测问题的回归算法包括线性回归、决策树回归和随机森林回归等。
对于回归算法的建模,可以使用交叉验证等方法进行模型的训练和评估,同时可以使用一些指标来评估模型的预测效果,比如均方根误差(RMSE)和决定系数(R2)等。
针对加州房价数据集,需要进行特征工程和建立回归模型对房价进行预测。下面是一些解决方案和代码示例:
特征工程:
回归模型:
下面是部分代码示例:
python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# BUG FIX: StratifiedShuffleSplit was used below but never imported.
from sklearn.model_selection import StratifiedShuffleSplit
# BUG FIX: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset.
file_name = '加州房价数据集.csv'
# Pass the path directly: pd.read_csv(open(file_name)) leaked a file handle.
housing = pd.read_csv(file_name)

# Fill missing bedroom counts with the median.
median = housing["卧室总数"].median()
housing["卧室总数"] = housing["卧室总数"].fillna(median)

# Stratified sampling on an income-category feature.
housing["income_cat"] = pd.cut(housing["收入中位数"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
# BUG FIX: the original line was truncated mid-call; completed to match the
# identical splits used earlier in this file.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
你把数据文件发给我一份
方法:
1、采用特征工程处理技术,并分析影响加州房价的影响因素,建立相应的数据模型;
2、采用回归算法,实现预测和评估加州房价;
3、运用可视化技术,结合实际案例,分析预测结果,并进行分析。
研究结论:
通过本研究,可以探究特征工程以及回归算法在预测和评估加州房价时的准确性和效率。本研究最终将通过可视化技术,对预测结果和案例分析进行研究,以提高特征工程以及回归算法预测和评估加州房价的准确性和效率。