更新时间: 试题数量: 购买人数: 提供作者:

有效期: 个月

章节介绍: 共有个章节

收藏
搜索
题库预览
data = __________ # 将 'horsepower' 列转换为数值类型,并(删除)处理转换中的异常值 data['horsepower'] = __________ data = __________ # 对数据集进行标准化处理 data[numerical_features] = __________ # 选择特征,自变量和目标变量 selected_features = __________ X = __________ y = __________ # 划分数据集为训练集和测试集(训练集占8成) X_train, X_test, y_train, y_test = __________ # 保存清洗和处理后的数据(不存储额外的索引号) cleaned_data.to_csv('2.1.1_cleaned_data.csv', index=False) # 读取一个Excel文件,并将读取到的数据存储在变量data中 data = __________ # 处理数据集中的缺失值 initial_row_count = __________ data = __________ final_row_count = __________ # 删除重复值 data = __________ data[numerical_features] = __________ # 选择特征 selected_features = [_________] X = __________ y = __________ # 创建目标变量 y = __________ # 数据划分(测试集占20%) X_train, X_test, y_train, y_test = __________ # 合并处理后得到的数据,并将其保存(保存中不用额外创建索引) cleaned_data = __________ cleaned_data.to_csv('2.1.2_cleaned_data.csv', index=False) # 加载数据 data = __________ # 显示前五行的数据 print(data.head()) # 使用IQR处理异常值 Q1 = __________ Q3 = __________ IQR = Q3-Q1 data_cleaned = data[~((data[numerical_cols] < (Q1 - 1.5 * IQR)) | (data[numerical_cols] > (Q3 + 1.5 * IQR))).any(axis=1)] # 检查处理重复值 duplicates = __________ # 对数据集进行归一化处理 data_cleaned[numerical_cols] = __________ # 设定目标变量 target_variable = __________ # 定义特征和目标 X = __________ y = __________ # 划分数据集(训练集占80%) X_train, X_test, y_train, y_test = __________
# 保存清洗后的数据到CSV __________ # 加载数据集并指定编码为gbk data = __________ # 查看表格基本信息 print(__________) # 修改列名 data.rename(columns={'病人ID':'患者ID'},inplace=True) # 增加诊断延迟和病程列 data["诊断延迟"] = __________ # 删除不合理的数据 data = data[(_________>=0) & (__________>0) & (__________<120)] data.drop_duplicates(inplace=True) # 对需要归一化的列进行处理 columns_to_normalize = [_________] data[columns_to_normalize] = __________ # 绘制柱状图 __________ # 绘制散点图 __________ # 保存处理后的数据 data.to_csv(output_path, index=False) # 加载数据集 data = __________ # 查看表格基本信息 print(data.info()) # 显示每一列的空缺值数量 print(data.isnull().sum()) # 删除含有缺失值的行 data_cleaned = __________ # 转换'Your age'列的数据类型为整数类型,并处理异常值 data_cleaned.loc[:, 'Your age'] = pd.to_numeric(data_cleaned['Your age'], errors='coerce') data_cleaned.loc[:, 'Your age'] = data_cleaned.loc[:, 'Your age'].astype(int) # 检查和删除重复值 data_cleaned = __________ # 归一化 data_cleaned['How do you describe your current level of fitness ?'] = label_encoder.fit_transform(data_cleaned['How do you describe your current level of fitness ?']) # 绘制饼图 exercise_frequency_counts.plot.pie(autopct='%1.1f%%', startangle=90, colors=plt.cm.Paired.colors) # 划分数据(测试集占比20%) train_data, test_data = __________ # 保存处理后的数据 cleaned_file_path = '2.1.5_cleaned_data.csv' data_filled.to_csv(cleaned_file_path, index=False) # 加载数据 data = __________ # 显示前五行的数据 print(data.head()) # 分割训练集和测试集(测试集20%) X_train, X_test, y_train, y_test = __________ # 训练Logistic回归模型(最大迭代次数为1000次) model = __________ # 训练Logistic回归模型 model.fit(X_train,y_train) # 保存模型 with open('2.2.1_model.pkl', 'wb') as file: pickle.dump(model, file) # 预测并保存结果 y_pred = __________ accuracy = __________
# 创建随机森林回归模型实例(创建的决策树的数量为100) rf_model = __________ rf_model = RandomForestRegressor(n_estimators=100, random_state=42) # 训练随机森林回归模型 rf_model.fit(X_train, y_train) # 使用随机森林模型进行预测 y_pred_rf = __________ y_pred_rf = rf_model.predict(X_test) # 保存新的结果 results_rf_df.to_csv('2.2.2_results_rf.txt', index=False) # 加载数据集 df = __________ df = pd.read_csv('fitness analysis.csv') # 显示前五行数据 print(_________) print(df.head()) # 选择相关特征进行建模 X = __________ X = pd.get_dummies(X) y = __________ y = df['Your age'].apply(lambda x: int(x.split('(')[0])) # 将年龄段转为数值变量 X_train,X_test,y_train,y_test = __________ X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2,random_state=42) # 创建随机森林回归模型(创建的决策树的数量为100) rf_model = __________ rf_model = RandomForestRegressor(n_estimators=100, random_state=42) # 训练随机森林回归模型 rf_model.fit(X_train, y_train) # 保存训练好的模型 with open('2.2.3_model.pkl','wb') as model_file: pickle.dump(rf_model,model_file) # 进行结果预测 y_pred = __________ y_pred = rf_model.predict(X_test) # 使用测试工具对模型进行测试,并记录测试结果 train_score = __________ test_score = __________ mse = __________ r2 = __________ train_score = rf_model.score(X_train,y_train) #训练集分数 test_score = rf_model.score(X_test,y_test) #测试集分数 mse = mean_squared_error(y_test,y_pred) #均方误差 r2 = r2_score(y_test,y_pred) #决定系数
1 2