In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
In [2]:
# Fetch the California Housing dataset and pull out the feature matrix
# and the regression target as separate arrays.
housing = fetch_california_housing()
X = housing.data
y = housing.target

# Report the column names and the overall size of the design matrix.
n_samples, n_features = X.shape
print(f"Features: {housing.feature_names}")
print(f"Samples: {n_samples}, Features: {n_features}")

# Recorded output:
#   Features: ['MedInc', 'HouseAge', ...]
#   Samples: 20640, Features: 8
In [3]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps
# the partition identical from one run to the next.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Row counts of each partition.
print(f"Train: {len(X_train)}, Test: {len(X_test)}")

# Recorded output:
#   Train: 16512, Test: 4128
In [4]:
# Build an ordinary least-squares regressor and fit it on the training
# partition in one expression (fit() returns the fitted estimator itself).
model = LinearRegression().fit(X_train, y_train)
print("Model trained!")

# Recorded output:
#   Model trained!
In [5]:
# Score the held-out rows with the fitted model.
y_pred = model.predict(X_test)

# MSE is in squared target units; its square root (RMSE) is back on the
# same scale as the target, which makes it easier to interpret.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")

# Recorded output:
#   MSE: 0.5559
#   RMSE: 0.7456
In [6]:
# List the fitted coefficient for every input feature, right-aligning the
# feature name and always showing the sign of the weight.
# NOTE(review): no feature-scaling step is visible in this notebook, so each
# coefficient is expressed per unit of its own feature; magnitudes are not
# directly comparable as "importance" across features -- confirm intent.
for feature, weight in zip(housing.feature_names, model.coef_):
    print(f"{feature:>12}: {weight:+.4f}")

# Recorded output (truncated in the original):
#   MedInc: +0.4367
#   HouseAge: +0.0094
#   ...
In [7]:
# Plot predictions vs actual values for the held-out rows.
# Uses the explicit Figure/Axes interface rather than pyplot's implicit
# current-axes state, so the cell does not depend on earlier figures.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.3, s=5)

# Reference line y = x (a perfect prediction). Its endpoints are taken from
# the observed data instead of a hard-coded [0, 5], so the line still spans
# every plotted point when a value falls outside that interval.
line_lo = min(np.min(y_test), np.min(y_pred))
line_hi = max(np.max(y_test), np.max(y_pred))
ax.plot([line_lo, line_hi], [line_lo, line_hi], 'r--', lw=2)

ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Linear Regression: Predicted vs Actual")
plt.show()