RGITX
Home
WEB LAB
ADA
⚠️ This action is not allowed during the exam.
ML LAB
on March 05, 2025
PROGRAM 1
California Housing Analysis
"""Exploratory analysis of the California Housing dataset.

Loads the dataset, draws a histogram grid and one box plot per numerical
column, and reports how many IQR outliers each column contains.
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing


def load_data() -> pd.DataFrame:
    """Return the California Housing features plus a ``Target`` column.

    Returns:
        A DataFrame holding the feature columns of the dataset and the
        target values under the column name ``Target``.
    """
    data = fetch_california_housing(as_frame=True)
    # Work on a copy so adding a column does not modify the frame that is
    # still held by the Bunch object.
    df = data['data'].copy()
    df['Target'] = data['target']
    return df


def plot_histograms(df: pd.DataFrame) -> None:
    """Show a grid of histograms, one per numerical column of *df*.

    Args:
        df: Frame whose numerical columns are plotted.
    """
    numerical_features = df.select_dtypes(include='number').columns
    df[numerical_features].hist(
        bins=20,
        figsize=(15, 10),
        color='skyblue',
        edgecolor='black',
    )
    plt.suptitle('Histograms of Numerical Features', fontsize=16)
    plt.tight_layout()
    plt.show()


def _count_iqr_outliers(values: pd.Series, whisker: float = 1.5) -> int:
    """Count the entries of *values* lying outside the IQR fences.

    Args:
        values: Numerical series to inspect.
        whisker: Multiple of the interquartile range added beyond Q1/Q3 to
            form the lower and upper fences.

    Returns:
        Number of entries strictly below ``Q1 - whisker * IQR`` or strictly
        above ``Q3 + whisker * IQR``.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr
    # Summing the boolean mask counts the outliers without building a
    # filtered copy of the data.
    return int(((values < lower_bound) | (values > upper_bound)).sum())


def plot_boxplots(df: pd.DataFrame) -> None:
    """Show a box plot for every numerical column and print outlier counts.

    For each numerical column the box plot is displayed first, then the
    number of IQR outliers (1.5 * IQR fences) is printed.

    Args:
        df: Frame whose numerical columns are plotted and inspected.
    """
    numerical_features = df.select_dtypes(include='number').columns
    for feature in numerical_features:
        plt.figure(figsize=(8, 6))
        sns.boxplot(x=df[feature], color='lightblue')
        plt.title(f'Box Plot of {feature}', fontsize=14)
        plt.xlabel(feature, fontsize=12)
        plt.tight_layout()
        plt.show()

        outlier_count = _count_iqr_outliers(df[feature])
        print(f"{feature}: {outlier_count} outliers found.")


def main() -> None:
    """Load the dataset, then run the histogram and box-plot steps."""
    df = load_data()
    print("Dataset loaded successfully!")
    print("\nGenerating histograms for numerical features...")
    plot_histograms(df)
    print("\nGenerating box plots and identifying outliers...")
    plot_boxplots(df)


if __name__ == "__main__":
    main()
Copy