# Standard library imports
import re
import warnings
import pprint
from tabulate import tabulate

# Data processing and numerical computing
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import make_scorer
from quantile_forest import RandomForestQuantileRegressor

# Configuration
warnings.filterwarnings('ignore', category=UserWarning)

# Specialized libraries
import cpi
cpi.update() # ensure cpi is up-to-date and able to accurately inflate to today's dollars
from datetime import date
from geopy.distance import geodesic
import osmnx as ox

# Set random seeds for reproducibility
np.random.seed(42)

### Manage restrictions on how many columns and rows display
def manage_df_display(rows=60, columns=20):
    pd.set_option('display.max_rows', rows)
    pd.set_option('display.max_columns', columns)
    pd.set_option('display.width', None)
    

### Analyze Dataframe
def analyze_dataframe(df):
    """Comprehensive analysis of DataFrame for preprocessing issues."""
    
    print("=" * 50)
    print("DATAFRAME ANALYSIS")
    print("=" * 50)
    
    print(f"Shape: {df.shape}")
    print(f"Data types:\n{df.dtypes.value_counts()}")
    
    # Analyze numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        print(f"\n--- NUMERIC COLUMNS ({len(numeric_cols)}) ---")
        for col in numeric_cols:
            series = df[col]
            inf_count = np.isinf(series).sum()
            nan_count = series.isnull().sum()
            large_count = (np.abs(series) > 1e10).sum()
            
            if inf_count > 0 or nan_count > 0 or large_count > 0:
                print(f"{col}:")
                print(f"  - Infinite values: {inf_count}")
                print(f"  - NaN values: {nan_count}")
                print(f"  - Extremely large values: {large_count}")
    
    # Analyze non-numeric columns
    non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
    if len(non_numeric_cols) > 0:
        print(f"\n--- NON-NUMERIC COLUMNS ({len(non_numeric_cols)}) ---")
        for col in non_numeric_cols:
            series = df[col]
            print(f"{col}: {series.dtype}, {series.nunique()} unique values, {series.isnull().sum()} ({series.isnull().sum() / len(series)}) missing")


def column_analysis(df, columns=None, max_categories=10):
    """
    Comprehensive analysis of specified columns with value counts and statistics
    
    Parameters:
    df: pandas DataFrame
    columns: list of column names to analyze (optional)
             If None, analyzes all columns in the DataFrame
    max_categories: maximum number of categories to display per column
    """
    import pandas as pd  # Added import for numeric type checking
    
    # Determine which columns to analyze
    if columns is None:
        columns_to_analyze = df.columns.tolist()
        print("=" * 90)
        print(" DETAILED COLUMN ANALYSIS - ALL COLUMNS")
        print("=" * 90)
    else:
        # Validate that specified columns exist in the DataFrame
        missing_columns = [col for col in columns if col not in df.columns]
        if missing_columns:
            print(f"  Warning: The following columns were not found in the DataFrame: {missing_columns}")
        
        columns_to_analyze = [col for col in columns if col in df.columns]
        if not columns_to_analyze:
            print("! Error: None of the specified columns exist in the DataFrame ! ")
            return
        
        print("=" * 90)
        print(f" DETAILED COLUMN ANALYSIS - SELECTED COLUMNS ({len(columns_to_analyze)})")
        print("=" * 90)
        print(f"Analyzing columns: {', '.join(columns_to_analyze)}")
    
    for column in columns_to_analyze:
        print(f"\n COLUMN: {column}")
        print("=" * 70)
        
        # Basic info
        col_data = df[column]
        data_type = col_data.dtype
        total_rows = len(df)
        missing_count = col_data.isnull().sum()
        non_missing_count = total_rows - missing_count
        
        print(f"Data Type: {data_type}")
        print(f"Total Rows: {total_rows:,} | Non-Missing: {non_missing_count:,} | Missing: {missing_count:,}")
        
        if non_missing_count == 0:
            print(" ! No data to analyze (all values are missing) ! ")
            continue
        
        # Value counts analysis
        print(f"\n VALUE DISTRIBUTION:")
        print("-" * 50)
        
        value_counts = col_data.value_counts().head(max_categories)
        
        # Prepare detailed table
        table_data = []
        cumulative_count = 0
        
        for rank, (value, count) in enumerate(value_counts.items(), 1):
            cumulative_count += count
            percentage = (count / total_rows * 100)
            cumulative_percentage = (cumulative_count / total_rows * 100)
            
            # Format value for display
            display_value = str(value)
            if len(display_value) > 30:
                display_value = display_value[:27] + "..."
            
            table_data.append([
                f"#{rank}",
                display_value,
                f"{count:,}",
                f"{percentage:.2f}%",
                f"{cumulative_percentage:.2f}%"
            ])
        
        headers = ["Rank", "Value", "Count", "Percentage", "Cumulative %"]
        print(tabulate(table_data, headers=headers, tablefmt="fancy_grid"))
        
        # Additional statistics
        print(f"\n STATISTICS:")
        print("-" * 30)
        unique_count = col_data.nunique()
        print(f"• Unique values: {unique_count:,}")
        print(f"• Missing values: {missing_count:,} ({missing_count/total_rows*100:.2f}%)")
        
        if unique_count > 0:
            print(f"• Most frequent: '{value_counts.index[0]}' ({value_counts.iloc[0]:,} times)")
            print(f"• Least frequent: '{value_counts.index[-1]}' ({value_counts.iloc[-1]:,} times)")
        
        # NEW: Numeric statistics for numeric columns
        if pd.api.types.is_numeric_dtype(col_data):
            non_missing_data = col_data.dropna()
            if len(non_missing_data) > 0:
                min_value = non_missing_data.min()
                max_value = non_missing_data.max()
                print(f"• Minimum value: {min_value}")
                print(f"• Maximum value: {max_value}")
            
        # Data quality insights
        if unique_count == non_missing_count:
            print("• All values are unique (potential ID column)")
        elif unique_count == 1:
            print("•  All values are the same (constant column)")
        elif unique_count <= 10:
            print("• Low cardinality (good for categorical analysis)")
        elif unique_count > non_missing_count * 0.8:
            print("• High cardinality (many unique values)")


### Apply IQR-based outlier detection for price and square footage
def calculate_outlier_table(df, columns):
    """Return a DataFrame summarizing outlier percentages for specified columns.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        columns (list of str): List of column names to check for outliers.

    Returns:
        pd.DataFrame: Prettified table with outlier percentages for each column.
    """
    results = []
    for col in columns:
        if col in df.columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            total = len(df)
            left_outliers = df[df[col] < lower_bound]
            right_outliers = df[df[col] > upper_bound]

            left_pct = (len(left_outliers) / total) * 100 if total > 0 else 0
            right_pct = (len(right_outliers) / total) * 100 if total > 0 else 0
            total_pct = left_pct + right_pct

            results.append({
                'Column': col,
                'dtype': df[col].dtype,
                'min': df[col].min(),
                'max': df[col].max(),
                'Total Outliers (%)': round(total_pct, 2),
                'Lower Outliers (%)': round(left_pct, 2),
                'Upper Outliers (%)': round(right_pct, 2)
            })
        else:
            results.append({
                'Column': col,
                'dtype': df[col].dtype,
                'min': df[col].min(),
                'max': df[col].max(),
                'Total Outliers (%)': None,
                'Lower Outliers (%)': None,
                'Upper Outliers (%)': None
            })

    # Create DataFrame from results
    result_df = pd.DataFrame(results).sort_values(by=['Total Outliers (%)'], ascending=False)
    
    # Apply styling to the DataFrame
    styled_df = result_df.style.background_gradient(
        cmap='YlOrRd', 
        subset=['Total Outliers (%)']
    ).format({
        'Total Outliers (%)': '{:.2f}%',
        'Lower Outliers (%)': '{:.2f}%',
        'Upper Outliers (%)': '{:.2f}%',
        'max': '{:.2f}',
        'min': '{:.2f}',
    }).set_properties(**{
        'text-align': 'center',
        'border': '1px solid gray',
        'padding': '5px'
    }).set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#f2f2f2'), 
                                    ('color', 'black'),
                                    ('font-weight', 'bold'),
                                    ('text-align', 'center'),
                                    ('border', '1px solid gray'),
                                    ('padding', '5px')]},
        {'selector': 'caption', 'props': [('caption-side', 'top'), 
                                         ('font-size', '1.2em'),
                                         ('font-weight', 'bold')]}
    ]).set_caption('Outlier Analysis')
    
    return styled_df

def cap_outliers_by_percentile(df: pd.DataFrame, 
                             columns: list, 
                             lower_percentile: float = 0.05,
                             upper_percentile: float = 0.95,
                             inplace: bool = False) -> pd.DataFrame:
    """
    Caps extreme values at specified percentiles for given columns.
    
    Parameters:
    df (pd.DataFrame): Input DataFrame
    columns (list): List of column names to cap
    lower_percentile (float): Lower percentile threshold (default: 0.05 for 5th percentile)
    upper_percentile (float): Upper percentile threshold (default: 0.95 for 95th percentile)
    inplace (bool): Whether to modify the original DataFrame
    
    Returns:
    pd.DataFrame: DataFrame with capped values (if inplace=False)
    """
    # Work with original DataFrame if inplace=True, otherwise create a copy
    df_result = df if inplace else df.copy()
    
    # Track capping statistics
    capping_stats = {}
    
    # Process each specified column
    for col in columns:
        if col not in df.columns:
            print(f"Warning: Column '{col}' not found in DataFrame")
            continue
            
        # Skip non-numeric columns
        if not pd.api.types.is_numeric_dtype(df[col]):
            print(f"Warning: Column '{col}' is not numeric, skipping...")
            continue
        
        # Calculate percentile bounds
        lower_bound = df[col].quantile(lower_percentile)
        upper_bound = df[col].quantile(upper_percentile)
        
        # Count values that will be capped
        lower_capped = (df[col] < lower_bound).sum()
        upper_capped = (df[col] > upper_bound).sum()
        
        # Apply capping
        df_result[col] = np.clip(df[col], lower_bound, upper_bound)
        
        # Store statistics
        capping_stats[col] = {
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'lower_capped_count': lower_capped,
            'upper_capped_count': upper_capped,
            'total_capped': lower_capped + upper_capped
        }
    
    # Print summary
    print("Capping Summary:")
    for col, stats in capping_stats.items():
        print(f"  {col}: {stats['total_capped']} values capped "
              f"(Lower: {stats['lower_capped_count']}, Upper: {stats['upper_capped_count']})")
        print(f"    Bounds: [{stats['lower_bound']:.4f}, {stats['upper_bound']:.4f}]")
    
    if not inplace:
        return df_result

        
def calculate_correlation_table(df, feature_columns, target_column, correlation_type='pearson'):
    """Return a DataFrame summarizing correlations between features and target variable.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        feature_columns (list of str): List of feature column names to correlate with target.
        target_column (str): Name of the target variable column.
        correlation_type (str): Type of correlation to calculate. 
                               Options: 'pearson', 'spearman', 'kendall'. Default: 'pearson'.

    Returns:
        pd.DataFrame: Prettified table with correlation coefficients for each feature.
    """
    import pandas as pd
    import numpy as np
    from scipy import stats
    
    # Validate correlation type
    valid_types = ['pearson', 'spearman', 'kendall']
    if correlation_type not in valid_types:
        raise ValueError(f"correlation_type must be one of {valid_types}")
    
    # Check if target column exists
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found in DataFrame")
    
    results = []
    
    for col in feature_columns:
        if col in df.columns:
            # Get non-null values for both columns
            valid_data = df[[col, target_column]].dropna()
            
            if len(valid_data) < 2:
                correlation = np.nan
                p_value = np.nan
                sample_size = len(valid_data)
            else:
                try:
                    if correlation_type == 'pearson':
                        corr_result = stats.pearsonr(valid_data[col], valid_data[target_column])
                    elif correlation_type == 'spearman':
                        corr_result = stats.spearmanr(valid_data[col], valid_data[target_column])
                    elif correlation_type == 'kendall':
                        corr_result = stats.kendalltau(valid_data[col], valid_data[target_column])
                    
                    # Extract scalar values from the result
                    correlation = float(corr_result[0]) if hasattr(corr_result[0], '__len__') and len(corr_result[0]) == 1 else corr_result[0]
                    p_value = float(corr_result[1]) if hasattr(corr_result[1], '__len__') and len(corr_result[1]) == 1 else corr_result[1]
                    
                    # Ensure they are scalars
                    if hasattr(correlation, '__len__'):
                        correlation = correlation[0] if len(correlation) > 0 else np.nan
                    if hasattr(p_value, '__len__'):
                        p_value = p_value[0] if len(p_value) > 0 else np.nan
                        
                    sample_size = len(valid_data)
                except Exception as e:
                    correlation = np.nan
                    p_value = np.nan
                    sample_size = len(valid_data)
            
            # Determine significance level
            if pd.isna(p_value) or p_value is None:
                significance = 'N/A'
            elif p_value < 0.001:
                significance = '***'
            elif p_value < 0.01:
                significance = '**'
            elif p_value < 0.05:
                significance = '*'
            else:
                significance = ''
            
            results.append({
                'Feature': col,
                'dtype': str(df[col].dtype),
                'Correlation': correlation,
                'P-Value': p_value,
                'Significance': significance,
                'Sample Size': sample_size,
                'Abs Correlation': abs(correlation) if not pd.isna(correlation) else np.nan
            })
        else:
            results.append({
                'Feature': col,
                'dtype': 'N/A',
                'Correlation': np.nan,
                'P-Value': np.nan,
                'Significance': 'Column Missing',
                'Sample Size': 0,
                'Abs Correlation': np.nan
            })
    
    # Create DataFrame from results
    result_df = pd.DataFrame(results).sort_values(by=['Abs Correlation'], ascending=False, na_position='last')
    
    # Drop the helper column
    result_df = result_df.drop('Abs Correlation', axis=1)
    
    # Apply styling to the DataFrame
    styled_df = result_df.style.background_gradient(
        cmap='RdYlBu_r', 
        subset=['Correlation'],
        vmin=-1,
        vmax=1
    ).format({
        'Correlation': '{:.4f}',
        'P-Value': '{:.4f}',
        'Sample Size': '{:,}'
    }).set_properties(**{
        'text-align': 'center',
        'border': '1px solid gray',
        'padding': '5px'
    }).set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#f2f2f2'), 
                                    ('color', 'black'),
                                    ('font-weight', 'bold'),
                                    ('text-align', 'center'),
                                    ('border', '1px solid gray'),
                                    ('padding', '5px')]},
        {'selector': 'caption', 'props': [('caption-side', 'top'), 
                                         ('font-size', '1.2em'),
                                         ('font-weight', 'bold')]}
    ]).set_caption(f'Correlation Analysis ({correlation_type.title()})')
    
    return styled_df


def categorize_zoning(zoning_code):
    """
    Categorizes King County zoning codes into consolidated categories.
    
    Parameters:
    -----------
    zoning_code : str
        The zoning designation to categorize
        
    Returns:
    --------
    str
        One of six categories: 'Residential Zones', 'Neighborhood Residential', 
        'Low-Rise Residential', 'Special Use Zones', 'Mixed Use and Commercial', 
        'Industrial and Other'
    """
    
    # Handle missing or null values
    if pd.isna(zoning_code) or zoning_code == '' or zoning_code == 'Unknown':
        return 'Industrial and Other'
    
    # Convert to string and strip whitespace
    zone = str(zoning_code).strip()
    
    # Neighborhood Residential (check first due to specificity)
    if re.match(r'^NR\d*$', zone):  # NR, NR1, NR2, NR3
        return 'Neighborhood Residential'
    
    # Low-Rise Residential
    if re.match(r'^LR\d*\s*(\([^)]*\))?', zone):  # LR1 (M), LR2 (M), LR3 (M), LR3 RC (M)
        return 'Low-Rise Residential'
    
    # Mixed Use and Commercial
    mixed_use_patterns = [
        r'^MU$',                    # MU
        r'^MR\s*(\([^)]*\))?',     # MR (M1)
        r'^MML\s*U/\d+',           # MML U/85
        r'^MUR-\d+',               # MUR-45, MUR-70
        r'^NC\d*P?-\d+\s*(\([^)]*\))?',  # NC1P-55 (M), NC2P-55 (M), NC2-40
        r'^C\d+-\d+\s*(\([^)]*\))?'      # C1-55 (M)
    ]
    
    if any(re.match(pattern, zone) for pattern in mixed_use_patterns):
        return 'Mixed Use and Commercial'
    
    # Special Use Zones
    special_use_patterns = [
        r'^SF\s*\d*\s*(-\w+)?$',   # SF 5000, SF 7200, SF-S, SF-SL
        r'^SR-[\d.]+$',            # SR-4.5, SR-6
        r'^UVSF-\d+$',             # UVSF-1
        r'^SFR\s*\d*$'             # SFR with numbers
    ]
    
    if any(re.match(pattern, zone) for pattern in special_use_patterns):
        return 'Special Use Zones'
    
    # Residential Zones (basic residential patterns)
    residential_patterns = [
        r'^R-\d+$',                # R-1, R-4, R-6, R-8
        r'^R\d+$',                 # R4, R6, R8 (without dash)
        r'^RS-?\d+$',              # RS-7200, RS9600
        r'^RSA\s*\d+$',            # RSA 4, RSA 6
        r'^RSX\s*[\d.]+$',         # RSX 7.2
        r'^RSL\s*(\([^)]*\))?$',   # RSL (M)
        r'^RA[\d.]+[A-Z]*$',       # RA2.5, RA5, RA5P, RA5SO
        r'^R\s+\d+[a-z]?$'         # R 5400d
    ]
    
    if any(re.match(pattern, zone) for pattern in residential_patterns):
        return 'Residential Zones'
    
    # Industrial and Other (catch-all for remaining categories)
    industrial_other_patterns = [
        r'^[LH]DR$',               # LDR, HDR
        r'^L-\d+$',                # L-1, L-3
        r'^UL-\d+$',               # UL-7200
        r'^TC(\s*A\d+)?$',         # TC, TC A3
        r'^O$',                    # O (Office)
        r'^PUD$',                  # PUD
        r'^UR$',                   # UR
        r'^RM\d*(-\d+)?$',         # RM1800, RM-48
        r'^A\d+$',                 # A10, A35
        r'^NMF$'                   # NMF
    ]
    
    if any(re.match(pattern, zone) for pattern in industrial_other_patterns):
        return 'Industrial and Other'
    
    # Default fallback for unrecognized patterns
    return 'Industrial and Other'



import pandas as pd

def extract_sale_warning_codes(df):
    """
    Extract sale warning codes into binary features
    """
    # Clean and split the sale_warning column
    df['sale_warning_clean'] = df['sale_warning'].str.strip()
    df['sale_warning_list'] = df['sale_warning_clean'].str.split()
    
    # Create binary columns for each possible code (1-62)
    for code in range(1, 63):
        code_str = str(code)
        df[f'sale_warning_{code}'] = df['sale_warning_list'].apply(
            lambda x: int(code_str in x) if isinstance(x, list) else 0
        )
    
    # Clean up intermediate columns
    df.drop(columns=['sale_warning_clean', 'sale_warning_list'], inplace=True)
    
    return df

train_data = pd.read_csv('dataset.csv')
test_data = pd.read_csv('test.csv')
manage_df_display(columns=None)
train_data.head()

train_data.describe()

analyze_dataframe(train_data)

==================================================
DATAFRAME ANALYSIS
==================================================
Shape: (200000, 47)
Data types:
int64      36
object      7
float64     4
Name: count, dtype: int64

--- NUMERIC COLUMNS (40) ---
sale_nbr:
  - Infinite values: 0
  - NaN values: 42182
  - Extremely large values: 0

--- NON-NUMERIC COLUMNS (7) ---
sale_date: object, 313 unique values, 0 (0.0) missing
sale_warning: object, 142 unique values, 0 (0.0) missing
join_status: object, 8 unique values, 0 (0.0) missing
city: object, 41 unique values, 0 (0.0) missing
zoning: object, 500 unique values, 0 (0.0) missing
subdivision: object, 10376 unique values, 17550 (0.08775) missing
submarket: object, 19 unique values, 1717 (0.008585) missing

# No true duplicates.
print(train_data.duplicated().sum())

0

'''While no unqiue identifier is in the dataset, we can try to identify properties that have been sold 
more than once by searching and sorting through by the latitude and longitude of the parcel and property subdivision'''

train_data[train_data.duplicated(subset=['latitude', 'longitude', 'subdivision'],
                                 keep=False)].sort_values(by=['latitude', 'longitude', 'subdivision']).head(10)

'''While it is possible for renovations/modifications between sales, altering the phyical characteristics of a property, 
most of the properties linked by 'latitude', 'longitude', 'subdivision' share several physical characteristics (notably year built) suggesting
multiple sales of the same property. Example: id 62015 and 75626 appear to be the same property based on latitude, longitude, year built,
sqft_lot, sqft, and other values.'''

print(train_data.duplicated(subset=['latitude', 'longitude', 'subdivision']).sum())

37403

### Create masks for slicing data into smaller groups

sale_data = ['sale_date', 'sale_price', 'sale_nbr', 'sale_warning']
admin_data = ['join_year', 'join_status']
geo_data = ['latitude', 'longitude', 'area', 'city', 'submarket']
legal_data = ['zoning', 'subdivision', 'present_use']
assessor_data = ['land_val', 'imp_val', 'grade', 'fbsmt_grade', 'condition']
property_data = ['year_built', 'year_reno', 'sqft', 'sqft_lot', 'sqft_fbsmt', 'sqft_1', 'stories', 'beds',
                 'bath_full', 'bath_3qtr', 'bath_half', 'garb_sqft', 'gara_sqft', 'wfnt', 'golf', 'greenbelt', 
                 'noise_traffic', 'view_rainier', 'view_olympics', 'view_cascades', 'view_territorial', 'view_skyline',
                 'view_sound', 'view_lakewash', 'view_lakesamm', 'view_otherwater', 'view_other']

### Confirm all features minus 'id' accounted for
len(sale_data + admin_data + geo_data + legal_data + assessor_data + property_data) + 1 == len(train_data.columns)

True

train_data[admin_data].value_counts()

join_year  join_status     
2025       nochg               126281
           new                  53085
1999       rebuilt - before      3706
2025       rebuilt - after       3095
1999       reno - before         3073
           demo                  2869
2025       reno - before         2791
1999       reno - after          2632
2025       miss99                2468
Name: count, dtype: int64

train_data[train_data['join_status'] == 'nochg'].head(10)

train_data[['sale_price', 'join_status', 'join_year', 'year_reno', 'condition', 'grade']].groupby(by=['join_status', 'join_year']).agg(['min', 'median', 'max'])

train_data = train_data[(train_data['join_year'] == 2025) & (train_data['join_status'] == 'new') | (train_data['join_status'] == 'nochg')]
train_data.head(20)

analyze_dataframe(train_data[sale_data])

==================================================
DATAFRAME ANALYSIS
==================================================
Shape: (179366, 4)
Data types:
object     2
int64      1
float64    1
Name: count, dtype: int64

--- NUMERIC COLUMNS (2) ---
sale_nbr:
  - Infinite values: 0
  - NaN values: 39197
  - Extremely large values: 0

--- NON-NUMERIC COLUMNS (2) ---
sale_date: object, 313 unique values, 0 (0.0) missing
sale_warning: object, 107 unique values, 0 (0.0) missing

### Convert Sale Date to datetime format
train_data['sale_date'] = pd.to_datetime(train_data.sale_date)

### Examine median price per year
train_data[['sale_date', 'sale_price']].groupby(train_data['sale_date'].dt.year).median()

### Convert Sale Price to 2025 dollars
inflation_dict = {}

for year in range(1999, 2025):
    mul = cpi.inflate(1, date(year, 12, 31), items='Housing', area='Seattle-Tacoma-Bellevue WA')
    inflation_dict[year] = mul

inflation_dict.update({2025: 1})


def adjust_for_inflation(dollar_value, year):
    updated_value = dollar_value * inflation_dict[year]
    return updated_value
    
train_data['adjusted_sale_price'] = train_data.apply(lambda row: adjust_for_inflation(row['sale_price'], row['sale_date'].year), axis=1).astype(int)

print(train_data[['sale_date', 'sale_price', 'adjusted_sale_price']].sort_values(by='sale_date'))

        sale_date  sale_price  adjusted_sale_price
12292  1999-01-15      118000               292628
29264  1999-01-15      285000               706771
29265  1999-01-15      149500               370745
19967  1999-01-15      139950               347061
153419 1999-01-15      179361               444797
...           ...         ...                  ...
16444  2025-01-15      886000               886000
158317 2025-01-15      745325               745325
172780 2025-01-15      610000               610000
96707  2025-01-15      865000               865000
50186  2025-01-15      980000               980000

[179366 rows x 3 columns]

# Remove unadjusted sale price
train_data.drop('sale_price', axis=1, inplace=True)

train_data = train_data[~train_data['sale_nbr'].isin([2,3,4,5,7,8,9,10,11])]

analyze_dataframe(train_data)

==================================================
DATAFRAME ANALYSIS
==================================================
Shape: (83483, 47)
Data types:
int64             36
object             6
float64            4
datetime64[ns]     1
Name: count, dtype: int64

--- NUMERIC COLUMNS (40) ---
sale_nbr:
  - Infinite values: 0
  - NaN values: 39197
  - Extremely large values: 0

--- NON-NUMERIC COLUMNS (7) ---
sale_date: datetime64[ns], 313 unique values, 0 (0.0) missing
sale_warning: object, 88 unique values, 0 (0.0) missing
join_status: object, 2 unique values, 0 (0.0) missing
city: object, 40 unique values, 0 (0.0) missing
zoning: object, 348 unique values, 0 (0.0) missing
subdivision: object, 9217 unique values, 7254 (0.08689194207203862) missing
submarket: object, 19 unique values, 801 (0.009594767797036522) missing

train_data['sale_nbr'] = train_data['sale_nbr'].astype('str')
train_data['sale_nbr'] = train_data['sale_nbr'].replace(to_replace='nan', value='standard')
train_data['sale_nbr'] = train_data['sale_nbr'].replace(to_replace='1.0', value='partial_split')
train_data['sale_nbr'] = train_data['sale_nbr'].replace(to_replace='6.0', value='sold_last_12mths')

column_analysis(train_data)

==========================================================================================
 DETAILED COLUMN ANALYSIS - ALL COLUMNS
==========================================================================================

 COLUMN: id
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │   Count │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       1 │       1 │ 0.00%        │ 0.00%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │       1 │ 0.00%        │ 0.00%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │       1 │ 0.00%        │ 0.00%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       7 │       1 │ 0.00%        │ 0.00%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       8 │       1 │ 0.00%        │ 0.01%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │      11 │       1 │ 0.00%        │ 0.01%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │      14 │       1 │ 0.00%        │ 0.01%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │      15 │       1 │ 0.00%        │ 0.01%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │      17 │       1 │ 0.00%        │ 0.01%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │      19 │       1 │ 0.00%        │ 0.01%          │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 83,483
• Missing values: 0 (0.00%)
• Most frequent: '1' (1 times)
• Least frequent: '19' (1 times)
• Minimum value: 1
• Maximum value: 199999
• All values are unique (potential ID column)

 COLUMN: sale_date
======================================================================
Data Type: datetime64[ns]
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════════════════╤═════════╤══════════════╤════════════════╕
│ Rank   │ Value               │   Count │ Percentage   │ Cumulative %   │
╞════════╪═════════════════════╪═════════╪══════════════╪════════════════╡
│ #1     │ 1999-06-15 00:00:00 │     731 │ 0.88%        │ 0.88%          │
├────────┼─────────────────────┼─────────┼──────────────┼────────────────┤
│ #2     │ 1999-03-15 00:00:00 │     692 │ 0.83%        │ 1.70%          │
├────────┼─────────────────────┼─────────┼──────────────┼────────────────┤
│ #3     │ 1999-07-15 00:00:00 │     680 │ 0.81%        │ 2.52%          │
├────────┼─────────────────────┼─────────┼──────────────┼────────────────┤
│ #4     │ 2000-06-15 00:00:00 │     661 │ 0.79%        │ 3.31%          │
├────────┼─────────────────────┼─────────┼──────────────┼────────────────┤
│ #5     │ 1999-08-15 00:00:00 │     659 │ 0.79%        │ 4.10%          │
├────────┼─────────────────────┼─────────┼──────────────┼────────────────┤
│ #6     │ 2004-06-15 00:00:00 │     648 │ 0.78%        │ 4.88%          │
├────────┼─────────────────────┼─────────┼──────────────┼────────────────┤
│ #7     │ 1999-09-15 00:00:00 │     646 │ 0.77%        │ 5.65%          │
├────────┼─────────────────────┼─────────┼──────────────┼────────────────┤
│ #8     │ 1999-05-15 00:00:00 │     640 │ 0.77%        │ 6.42%          │
├────────┼─────────────────────┼─────────┼──────────────┼────────────────┤
│ #9     │ 1999-04-15 00:00:00 │     626 │ 0.75%        │ 7.17%          │
├────────┼─────────────────────┼─────────┼──────────────┼────────────────┤
│ #10    │ 2003-08-15 00:00:00 │     612 │ 0.73%        │ 7.90%          │
╘════════╧═════════════════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 313
• Missing values: 0 (0.00%)
• Most frequent: '1999-06-15 00:00:00' (731 times)
• Least frequent: '2003-08-15 00:00:00' (612 times)

 COLUMN: sale_nbr
======================================================================
Data Type: object
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤══════════════════╤═════════╤══════════════╤════════════════╕
│ Rank   │ Value            │ Count   │ Percentage   │ Cumulative %   │
╞════════╪══════════════════╪═════════╪══════════════╪════════════════╡
│ #1     │ partial_split    │ 43,196  │ 51.74%       │ 51.74%         │
├────────┼──────────────────┼─────────┼──────────────┼────────────────┤
│ #2     │ standard         │ 39,197  │ 46.95%       │ 98.69%         │
├────────┼──────────────────┼─────────┼──────────────┼────────────────┤
│ #3     │ sold_last_12mths │ 1,090   │ 1.31%        │ 100.00%        │
╘════════╧══════════════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 3
• Missing values: 0 (0.00%)
• Most frequent: 'partial_split' (43,196 times)
• Least frequent: 'sold_last_12mths' (1,090 times)
• Low cardinality (good for categorical analysis)

 COLUMN: sale_warning
======================================================================
Data Type: object
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │ Value   │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │         │ 76,909  │ 92.13%       │ 92.13%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │ 26      │ 3,205   │ 3.84%        │ 95.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │ 15      │ 1,415   │ 1.69%        │ 97.66%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │ 40      │ 445     │ 0.53%        │ 98.19%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │ 15 26   │ 310     │ 0.37%        │ 98.56%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │ 56      │ 238     │ 0.29%        │ 98.85%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │ 41      │ 173     │ 0.21%        │ 99.06%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │ 29      │ 106     │ 0.13%        │ 99.18%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │ 10      │ 76      │ 0.09%        │ 99.27%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │ 60      │ 76      │ 0.09%        │ 99.37%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 88
• Missing values: 0 (0.00%)
• Most frequent: '   ' (76,909 times)
• Least frequent: ' 60 ' (76 times)

 COLUMN: join_status
======================================================================
Data Type: object
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │ Value   │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │ nochg   │ 64,933  │ 77.78%       │ 77.78%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │ new     │ 18,550  │ 22.22%       │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 2
• Missing values: 0 (0.00%)
• Most frequent: 'nochg' (64,933 times)
• Least frequent: 'new' (18,550 times)
• Low cardinality (good for categorical analysis)

 COLUMN: join_year
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │    2025 │ 83,483  │ 100.00%      │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 1
• Missing values: 0 (0.00%)
• Most frequent: '2025' (83,483 times)
• Least frequent: '2025' (83,483 times)
• Minimum value: 2025
• Maximum value: 2025
•  All values are the same (constant column)

 COLUMN: latitude
======================================================================
Data Type: float64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │   Count │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │ 47.6853 │      56 │ 0.07%        │ 0.07%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │ 47.6882 │      55 │ 0.07%        │ 0.13%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │ 47.6911 │      54 │ 0.06%        │ 0.20%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │ 47.6721 │      54 │ 0.06%        │ 0.26%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │ 47.6727 │      51 │ 0.06%        │ 0.32%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │ 47.6901 │      51 │ 0.06%        │ 0.38%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │ 47.6842 │      47 │ 0.06%        │ 0.44%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │ 47.5671 │      47 │ 0.06%        │ 0.50%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │ 47.5517 │      45 │ 0.05%        │ 0.55%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │ 47.6919 │      45 │ 0.05%        │ 0.60%          │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 5,589
• Missing values: 0 (0.00%)
• Most frequent: '47.6853' (56 times)
• Least frequent: '47.6919' (45 times)
• Minimum value: 47.1552
• Maximum value: 47.7778

 COLUMN: longitude
======================================================================
Data Type: float64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤══════════╤═════════╤══════════════╤════════════════╕
│ Rank   │    Value │   Count │ Percentage   │ Cumulative %   │
╞════════╪══════════╪═════════╪══════════════╪════════════════╡
│ #1     │ -122.351 │      75 │ 0.09%        │ 0.09%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #2     │ -122.349 │      63 │ 0.08%        │ 0.17%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #3     │ -122.362 │      62 │ 0.07%        │ 0.24%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #4     │ -122.288 │      61 │ 0.07%        │ 0.31%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #5     │ -122.308 │      59 │ 0.07%        │ 0.38%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #6     │ -122.29  │      59 │ 0.07%        │ 0.45%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #7     │ -122.363 │      57 │ 0.07%        │ 0.52%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #8     │ -122.314 │      57 │ 0.07%        │ 0.59%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #9     │ -122.3   │      55 │ 0.07%        │ 0.66%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #10    │ -122.356 │      55 │ 0.07%        │ 0.72%          │
╘════════╧══════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 6,332
• Missing values: 0 (0.00%)
• Most frequent: '-122.3509' (75 times)
• Least frequent: '-122.356' (55 times)
• Minimum value: -122.5272
• Maximum value: -121.1616

 COLUMN: area
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │      69 │ 2,197   │ 2.63%        │ 2.63%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │      35 │ 2,027   │ 2.43%        │ 5.06%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │      32 │ 1,565   │ 1.87%        │ 6.93%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │      37 │ 1,550   │ 1.86%        │ 8.79%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │      72 │ 1,543   │ 1.85%        │ 10.64%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │      56 │ 1,509   │ 1.81%        │ 12.45%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │       6 │ 1,506   │ 1.80%        │ 14.25%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │      53 │ 1,500   │ 1.80%        │ 16.05%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │      93 │ 1,457   │ 1.75%        │ 17.79%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │      73 │ 1,388   │ 1.66%        │ 19.46%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 89
• Missing values: 0 (0.00%)
• Most frequent: '69' (2,197 times)
• Least frequent: '73' (1,388 times)
• Minimum value: 1
• Maximum value: 100

 COLUMN: city
======================================================================
Data Type: object
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════════╤═════════╤══════════════╤════════════════╕
│ Rank   │ Value       │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════════╪═════════╪══════════════╪════════════════╡
│ #1     │ SEATTLE     │ 23,874  │ 28.60%       │ 28.60%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #2     │ KING COUNTY │ 10,996  │ 13.17%       │ 41.77%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #3     │ BELLEVUE    │ 4,891   │ 5.86%        │ 47.63%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #4     │ KENT        │ 4,256   │ 5.10%        │ 52.73%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #5     │ SAMMAMISH   │ 4,186   │ 5.01%        │ 57.74%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #6     │ RENTON      │ 4,055   │ 4.86%        │ 62.60%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #7     │ FEDERAL WAY │ 3,596   │ 4.31%        │ 66.90%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #8     │ KIRKLAND    │ 3,519   │ 4.22%        │ 71.12%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #9     │ SHORELINE   │ 2,480   │ 2.97%        │ 74.09%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #10    │ AUBURN      │ 2,444   │ 2.93%        │ 77.02%         │
╘════════╧═════════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 40
• Missing values: 0 (0.00%)
• Most frequent: 'SEATTLE' (23,874 times)
• Least frequent: 'AUBURN' (2,444 times)

 COLUMN: zoning
======================================================================
Data Type: object
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │ Value   │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │ NR3     │ 12,795  │ 15.33%       │ 15.33%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │ R6      │ 7,575   │ 9.07%        │ 24.40%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │ R4      │ 4,774   │ 5.72%        │ 30.12%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │ RA5     │ 3,778   │ 4.53%        │ 34.64%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │ NR2     │ 3,498   │ 4.19%        │ 38.83%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │ SR-6    │ 3,063   │ 3.67%        │ 42.50%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │ R-5     │ 2,583   │ 3.09%        │ 45.60%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │ R-6     │ 2,521   │ 3.02%        │ 48.62%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │ RS7.2   │ 2,445   │ 2.93%        │ 51.55%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │ NR      │ 2,311   │ 2.77%        │ 54.31%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 348
• Missing values: 0 (0.00%)
• Most frequent: 'NR3' (12,795 times)
• Least frequent: 'NR' (2,311 times)

 COLUMN: subdivision
======================================================================
Data Type: object
Total Rows: 83,483 | Non-Missing: 76,229 | Missing: 7,254

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤════════════════════════════════╤═════════╤══════════════╤════════════════╕
│ Rank   │ Value                          │   Count │ Percentage   │ Cumulative %   │
╞════════╪════════════════════════════════╪═════════╪══════════════╪════════════════╡
│ #1     │ MAPLE LEAF TO GREEN LAKE CI... │     312 │ 0.37%        │ 0.37%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #2     │ GILMAN PARK ADD BLKS 01 THR... │     288 │ 0.34%        │ 0.72%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #3     │ HOMECROFT ADD                  │     194 │ 0.23%        │ 0.95%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #4     │ CHEROKEE BAY PARK ASSESSORS... │     186 │ 0.22%        │ 1.17%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #5     │ GILMANS ADD BLKS 01 THRU 87    │     180 │ 0.22%        │ 1.39%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #6     │ SEA VIEW PARK                  │     171 │ 0.20%        │ 1.59%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #7     │ MC MICKEN HEIGHTS DIV NO. 02   │     154 │ 0.18%        │ 1.78%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #8     │ SALMON BAY PARK ADD            │     151 │ 0.18%        │ 1.96%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #9     │ STATE ADD TO SEATTLE NO. 04    │     139 │ 0.17%        │ 2.13%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #10    │ SOUTH PARK                     │     120 │ 0.14%        │ 2.27%          │
╘════════╧════════════════════════════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 9,217
• Missing values: 7,254 (8.69%)
• Most frequent: 'MAPLE LEAF TO GREEN LAKE CIRCLE POR OF' (312 times)
• Least frequent: 'SOUTH PARK' (120 times)

 COLUMN: present_use
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       2 │ 76,334  │ 91.44%       │ 91.44%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │      29 │ 6,667   │ 7.99%        │ 99.42%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       6 │ 482     │ 0.58%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 3
• Missing values: 0 (0.00%)
• Most frequent: '2' (76,334 times)
• Least frequent: '6' (482 times)
• Minimum value: 2
• Maximum value: 29
• Low cardinality (good for categorical analysis)

 COLUMN: land_val
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │   Count │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │  207000 │     613 │ 0.73%        │ 0.73%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │  216000 │     603 │ 0.72%        │ 1.46%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       0 │     524 │ 0.63%        │ 2.08%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │  177000 │     497 │ 0.60%        │ 2.68%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │  135000 │     461 │ 0.55%        │ 3.23%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │  486000 │     457 │ 0.55%        │ 3.78%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │  196000 │     454 │ 0.54%        │ 4.32%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │  243000 │     445 │ 0.53%        │ 4.86%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │  268000 │     429 │ 0.51%        │ 5.37%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │  225000 │     423 │ 0.51%        │ 5.88%          │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 2,289
• Missing values: 0 (0.00%)
• Most frequent: '207000' (613 times)
• Least frequent: '225000' (423 times)
• Minimum value: 0
• Maximum value: 10037000

 COLUMN: imp_val
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │    1000 │ 1,452   │ 1.74%        │ 1.74%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       0 │ 543     │ 0.65%        │ 2.39%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │  333000 │ 228     │ 0.27%        │ 2.66%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │  375000 │ 224     │ 0.27%        │ 2.93%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │  370000 │ 224     │ 0.27%        │ 3.20%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │  398000 │ 222     │ 0.27%        │ 3.47%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │  344000 │ 222     │ 0.27%        │ 3.73%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │  390000 │ 212     │ 0.25%        │ 3.99%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │  362000 │ 210     │ 0.25%        │ 4.24%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │  359000 │ 210     │ 0.25%        │ 4.49%          │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 2,234
• Missing values: 0 (0.00%)
• Most frequent: '1000' (1,452 times)
• Least frequent: '359000' (210 times)
• Minimum value: 0
• Maximum value: 6653000

 COLUMN: year_built
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │    1977 │ 1,687   │ 2.02%        │ 2.02%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │    1978 │ 1,674   │ 2.01%        │ 4.03%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │    1968 │ 1,611   │ 1.93%        │ 5.96%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │    1990 │ 1,510   │ 1.81%        │ 7.76%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │    1967 │ 1,475   │ 1.77%        │ 9.53%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │    1989 │ 1,416   │ 1.70%        │ 11.23%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │    1962 │ 1,385   │ 1.66%        │ 12.89%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │    1987 │ 1,385   │ 1.66%        │ 14.55%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │    1979 │ 1,358   │ 1.63%        │ 16.17%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │    1988 │ 1,233   │ 1.48%        │ 17.65%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 125
• Missing values: 0 (0.00%)
• Most frequent: '1977' (1,687 times)
• Least frequent: '1988' (1,233 times)
• Minimum value: 1900
• Maximum value: 2024

 COLUMN: year_reno
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 83,476  │ 99.99%       │ 99.99%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │    2022 │ 2       │ 0.00%        │ 99.99%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │    2024 │ 1       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │    2017 │ 1       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │    2020 │ 1       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │    2009 │ 1       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │    2023 │ 1       │ 0.00%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 7
• Missing values: 0 (0.00%)
• Most frequent: '0' (83,476 times)
• Least frequent: '2023' (1 times)
• Minimum value: 0
• Maximum value: 2024
• Low cardinality (good for categorical analysis)

 COLUMN: sqft_lot
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │    5000 │ 1,249   │ 1.50%        │ 1.50%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │    6000 │ 1,059   │ 1.27%        │ 2.76%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │    7200 │ 918     │ 1.10%        │ 3.86%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │    4000 │ 838     │ 1.00%        │ 4.87%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │    7500 │ 497     │ 0.60%        │ 5.46%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │    9600 │ 474     │ 0.57%        │ 6.03%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │    8400 │ 472     │ 0.57%        │ 6.60%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │    4800 │ 414     │ 0.50%        │ 7.09%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │    4500 │ 318     │ 0.38%        │ 7.47%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │    9000 │ 306     │ 0.37%        │ 7.84%          │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 19,966
• Missing values: 0 (0.00%)
• Most frequent: '5000' (1,249 times)
• Least frequent: '9000' (306 times)
• Minimum value: 381
• Maximum value: 2076940

 COLUMN: sqft
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │   Count │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │    1800 │     546 │ 0.65%        │ 0.65%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │    1300 │     534 │ 0.64%        │ 1.29%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │    1200 │     488 │ 0.58%        │ 1.88%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │    1660 │     477 │ 0.57%        │ 2.45%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │    1440 │     476 │ 0.57%        │ 3.02%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │    1700 │     475 │ 0.57%        │ 3.59%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │    2020 │     473 │ 0.57%        │ 4.16%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │    1900 │     471 │ 0.56%        │ 4.72%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │    2000 │     471 │ 0.56%        │ 5.28%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │    1820 │     469 │ 0.56%        │ 5.85%          │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 1,718
• Missing values: 0 (0.00%)
• Most frequent: '1800' (546 times)
• Least frequent: '1820' (469 times)
• Minimum value: 200
• Maximum value: 13310

 COLUMN: sqft_1
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │    1010 │ 1,117   │ 1.34%        │ 1.34%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │    1200 │ 1,082   │ 1.30%        │ 2.63%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │    1300 │ 1,056   │ 1.26%        │ 3.90%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │    1250 │ 1,042   │ 1.25%        │ 5.15%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │    1080 │ 1,002   │ 1.20%        │ 6.35%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │    1060 │ 992     │ 1.19%        │ 7.54%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │    1090 │ 951     │ 1.14%        │ 8.67%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │    1040 │ 915     │ 1.10%        │ 9.77%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │    1120 │ 912     │ 1.09%        │ 10.86%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │    1180 │ 904     │ 1.08%        │ 11.95%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 1,226
• Missing values: 0 (0.00%)
• Most frequent: '1010' (1,117 times)
• Least frequent: '1180' (904 times)
• Minimum value: 80
• Maximum value: 7760

 COLUMN: sqft_fbsmt
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 48,555  │ 58.16%       │ 58.16%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │     500 │ 869     │ 1.04%        │ 59.20%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │     600 │ 800     │ 0.96%        │ 60.16%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │     400 │ 779     │ 0.93%        │ 61.09%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │     700 │ 689     │ 0.83%        │ 61.92%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │     800 │ 673     │ 0.81%        │ 62.73%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │    1000 │ 599     │ 0.72%        │ 63.44%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │     300 │ 571     │ 0.68%        │ 64.13%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │     900 │ 563     │ 0.67%        │ 64.80%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │     480 │ 450     │ 0.54%        │ 65.34%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 499
• Missing values: 0 (0.00%)
• Most frequent: '0' (48,555 times)
• Least frequent: '480' (450 times)
• Minimum value: 0
• Maximum value: 5110

 COLUMN: grade
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       7 │ 34,397  │ 41.20%       │ 41.20%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       8 │ 25,950  │ 31.08%       │ 72.29%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       9 │ 10,566  │ 12.66%       │ 84.94%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       6 │ 6,984   │ 8.37%        │ 93.31%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │      10 │ 3,600   │ 4.31%        │ 97.62%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │      11 │ 1,030   │ 1.23%        │ 98.85%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │       5 │ 693     │ 0.83%        │ 99.68%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │      12 │ 213     │ 0.26%        │ 99.94%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │       4 │ 31      │ 0.04%        │ 99.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │      13 │ 14      │ 0.02%        │ 99.99%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 13
• Missing values: 0 (0.00%)
• Most frequent: '7' (34,397 times)
• Least frequent: '13' (14 times)
• Minimum value: 1
• Maximum value: 13

 COLUMN: fbsmt_grade
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 48,555  │ 58.16%       │ 58.16%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       7 │ 14,941  │ 17.90%       │ 76.06%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       6 │ 8,234   │ 9.86%        │ 85.92%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       8 │ 7,416   │ 8.88%        │ 94.80%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       9 │ 2,193   │ 2.63%        │ 97.43%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │       5 │ 1,267   │ 1.52%        │ 98.95%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │      10 │ 529     │ 0.63%        │ 99.58%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │      11 │ 156     │ 0.19%        │ 99.77%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │       4 │ 144     │ 0.17%        │ 99.94%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │      12 │ 22      │ 0.03%        │ 99.97%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 13
• Missing values: 0 (0.00%)
• Most frequent: '0' (48,555 times)
• Least frequent: '12' (22 times)
• Minimum value: 0
• Maximum value: 13

 COLUMN: condition
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       3 │ 49,235  │ 58.98%       │ 58.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       4 │ 25,571  │ 30.63%       │ 89.61%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       5 │ 8,373   │ 10.03%       │ 99.64%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       2 │ 269     │ 0.32%        │ 99.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       1 │ 35      │ 0.04%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 5
• Missing values: 0 (0.00%)
• Most frequent: '3' (49,235 times)
• Least frequent: '1' (35 times)
• Minimum value: 1
• Maximum value: 5
• Low cardinality (good for categorical analysis)

 COLUMN: stories
======================================================================
Data Type: float64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │     1   │ 40,955  │ 49.06%       │ 49.06%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │     2   │ 33,146  │ 39.70%       │ 88.76%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │     1.5 │ 6,701   │ 8.03%        │ 96.79%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │     3   │ 2,156   │ 2.58%        │ 99.37%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │     2.5 │ 462     │ 0.55%        │ 99.92%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │     4   │ 45      │ 0.05%        │ 99.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │     3.5 │ 17      │ 0.02%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │     4.5 │ 1       │ 0.00%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 8
• Missing values: 0 (0.00%)
• Most frequent: '1.0' (40,955 times)
• Least frequent: '4.5' (1 times)
• Minimum value: 1.0
• Maximum value: 4.5
• Low cardinality (good for categorical analysis)

 COLUMN: beds
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       3 │ 37,521  │ 44.94%       │ 44.94%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       4 │ 28,395  │ 34.01%       │ 78.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       2 │ 9,122   │ 10.93%       │ 89.88%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       5 │ 6,869   │ 8.23%        │ 98.11%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       6 │ 854     │ 1.02%        │ 99.14%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │       1 │ 548     │ 0.66%        │ 99.79%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │       7 │ 96      │ 0.11%        │ 99.91%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │       8 │ 36      │ 0.04%        │ 99.95%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │       0 │ 27      │ 0.03%        │ 99.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │       9 │ 7       │ 0.01%        │ 99.99%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 13
• Missing values: 0 (0.00%)
• Most frequent: '3' (37,521 times)
• Least frequent: '9' (7 times)
• Minimum value: 0
• Maximum value: 13

 COLUMN: bath_full
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       1 │ 43,834  │ 52.51%       │ 52.51%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 33,585  │ 40.23%       │ 92.74%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 4,743   │ 5.68%        │ 98.42%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       0 │ 819     │ 0.98%        │ 99.40%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       4 │ 471     │ 0.56%        │ 99.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │       5 │ 28      │ 0.03%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │       6 │ 2       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │       9 │ 1       │ 0.00%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 8
• Missing values: 0 (0.00%)
• Most frequent: '1' (43,834 times)
• Least frequent: '9' (1 times)
• Minimum value: 0
• Maximum value: 9
• Low cardinality (good for categorical analysis)

 COLUMN: bath_3qtr
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 45,360  │ 54.33%       │ 54.33%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 32,143  │ 38.50%       │ 92.84%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       2 │ 5,655   │ 6.77%        │ 99.61%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 294     │ 0.35%        │ 99.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       4 │ 24      │ 0.03%        │ 99.99%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │       5 │ 7       │ 0.01%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 6
• Missing values: 0 (0.00%)
• Most frequent: '0' (45,360 times)
• Least frequent: '5' (7 times)
• Minimum value: 0
• Maximum value: 5
• Low cardinality (good for categorical analysis)

 COLUMN: bath_half
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 44,455  │ 53.25%       │ 53.25%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 38,151  │ 45.70%       │ 98.95%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       2 │ 847     │ 1.01%        │ 99.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 25      │ 0.03%        │ 99.99%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       5 │ 3       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │       4 │ 1       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │      12 │ 1       │ 0.00%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 7
• Missing values: 0 (0.00%)
• Most frequent: '0' (44,455 times)
• Least frequent: '12' (1 times)
• Minimum value: 0
• Maximum value: 12
• Low cardinality (good for categorical analysis)

 COLUMN: garb_sqft
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 64,516  │ 77.28%       │ 77.28%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │     200 │ 1,107   │ 1.33%        │ 78.61%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │     240 │ 1,039   │ 1.24%        │ 79.85%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │     290 │ 778     │ 0.93%        │ 80.78%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │     220 │ 742     │ 0.89%        │ 81.67%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │     260 │ 728     │ 0.87%        │ 82.54%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │     480 │ 646     │ 0.77%        │ 83.32%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │     310 │ 554     │ 0.66%        │ 83.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │     300 │ 499     │ 0.60%        │ 84.58%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │     180 │ 491     │ 0.59%        │ 85.17%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 248
• Missing values: 0 (0.00%)
• Most frequent: '0' (64,516 times)
• Least frequent: '180' (491 times)
• Minimum value: 0
• Maximum value: 4000

 COLUMN: gara_sqft
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 37,014  │ 44.34%       │ 44.34%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │     440 │ 3,486   │ 4.18%        │ 48.51%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │     480 │ 3,034   │ 3.63%        │ 52.15%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │     400 │ 2,240   │ 2.68%        │ 54.83%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │     460 │ 2,239   │ 2.68%        │ 57.51%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │     420 │ 2,142   │ 2.57%        │ 60.08%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │     530 │ 1,383   │ 1.66%        │ 61.73%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │     500 │ 1,324   │ 1.59%        │ 63.32%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │     510 │ 900     │ 1.08%        │ 64.40%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │     550 │ 882     │ 1.06%        │ 65.46%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 522
• Missing values: 0 (0.00%)
• Most frequent: '0' (37,014 times)
• Least frequent: '550' (882 times)
• Minimum value: 0
• Maximum value: 4404

 COLUMN: wfnt
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 82,598  │ 98.94%       │ 98.94%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       8 │ 325     │ 0.39%        │ 99.33%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       9 │ 187     │ 0.22%        │ 99.55%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 173     │ 0.21%        │ 99.76%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       6 │ 124     │ 0.15%        │ 99.91%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │       7 │ 62      │ 0.07%        │ 99.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │       1 │ 8       │ 0.01%        │ 99.99%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │       5 │ 6       │ 0.01%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 8
• Missing values: 0 (0.00%)
• Most frequent: '0' (82,598 times)
• Least frequent: '5' (6 times)
• Minimum value: 0
• Maximum value: 9
• Low cardinality (good for categorical analysis)

 COLUMN: golf
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 83,062  │ 99.50%       │ 99.50%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 421     │ 0.50%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 2
• Missing values: 0 (0.00%)
• Most frequent: '0' (83,062 times)
• Least frequent: '1' (421 times)
• Minimum value: 0
• Maximum value: 1
• Low cardinality (good for categorical analysis)

 COLUMN: greenbelt
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 81,087  │ 97.13%       │ 97.13%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 2,396   │ 2.87%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 2
• Missing values: 0 (0.00%)
• Most frequent: '0' (81,087 times)
• Least frequent: '1' (2,396 times)
• Minimum value: 0
• Maximum value: 1
• Low cardinality (good for categorical analysis)

 COLUMN: noise_traffic
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 72,472  │ 86.81%       │ 86.81%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 6,704   │ 8.03%        │ 94.84%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       2 │ 3,690   │ 4.42%        │ 99.26%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 617     │ 0.74%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (72,472 times)
• Least frequent: '3' (617 times)
• Minimum value: 0
• Maximum value: 3
• Low cardinality (good for categorical analysis)

 COLUMN: view_rainier
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 82,905  │ 99.31%       │ 99.31%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 300     │ 0.36%        │ 99.67%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 241     │ 0.29%        │ 99.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 37      │ 0.04%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (82,905 times)
• Least frequent: '4' (37 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_olympics
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 81,768  │ 97.95%       │ 97.95%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 1,038   │ 1.24%        │ 99.19%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 460     │ 0.55%        │ 99.74%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 217     │ 0.26%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (81,768 times)
• Least frequent: '4' (217 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_cascades
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 81,535  │ 97.67%       │ 97.67%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 1,324   │ 1.59%        │ 99.25%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 508     │ 0.61%        │ 99.86%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 116     │ 0.14%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (81,535 times)
• Least frequent: '4' (116 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_territorial
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 76,605  │ 91.76%       │ 91.76%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 4,335   │ 5.19%        │ 96.95%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 1,867   │ 2.24%        │ 99.19%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 676     │ 0.81%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (76,605 times)
• Least frequent: '4' (676 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_skyline
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 82,962  │ 99.38%       │ 99.38%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 319     │ 0.38%        │ 99.76%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 121     │ 0.14%        │ 99.90%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 81      │ 0.10%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (82,962 times)
• Least frequent: '4' (81 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_sound
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 81,420  │ 97.53%       │ 97.53%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 751     │ 0.90%        │ 98.43%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       2 │ 573     │ 0.69%        │ 99.11%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 464     │ 0.56%        │ 99.67%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       4 │ 275     │ 0.33%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 5
• Missing values: 0 (0.00%)
• Most frequent: '0' (81,420 times)
• Least frequent: '4' (275 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_lakewash
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 81,665  │ 97.82%       │ 97.82%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 723     │ 0.87%        │ 98.69%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       2 │ 547     │ 0.66%        │ 99.34%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 352     │ 0.42%        │ 99.77%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       4 │ 196     │ 0.23%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 5
• Missing values: 0 (0.00%)
• Most frequent: '0' (81,665 times)
• Least frequent: '4' (196 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_lakesamm
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 82,999  │ 99.42%       │ 99.42%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 151     │ 0.18%        │ 99.60%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       1 │ 143     │ 0.17%        │ 99.77%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 105     │ 0.13%        │ 99.90%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       4 │ 85      │ 0.10%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 5
• Missing values: 0 (0.00%)
• Most frequent: '0' (82,999 times)
• Least frequent: '4' (85 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_otherwater
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 82,858  │ 99.25%       │ 99.25%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 302     │ 0.36%        │ 99.61%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 167     │ 0.20%        │ 99.81%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 156     │ 0.19%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (82,858 times)
• Least frequent: '4' (156 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_other
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 83,044  │ 99.47%       │ 99.47%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 324     │ 0.39%        │ 99.86%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 95      │ 0.11%        │ 99.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 20      │ 0.02%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (83,044 times)
• Least frequent: '4' (20 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: submarket
======================================================================
Data Type: object
Total Rows: 83,483 | Non-Missing: 82,682 | Missing: 801

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │ Value   │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │ K       │ 9,038   │ 10.83%       │ 10.83%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │ I       │ 7,345   │ 8.80%        │ 19.62%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │ B       │ 6,733   │ 8.07%        │ 27.69%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │ R       │ 6,659   │ 7.98%        │ 35.67%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │ Q       │ 6,071   │ 7.27%        │ 42.94%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │ O       │ 5,268   │ 6.31%        │ 49.25%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │ F       │ 4,760   │ 5.70%        │ 54.95%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │ D       │ 4,732   │ 5.67%        │ 60.62%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │ C       │ 4,392   │ 5.26%        │ 65.88%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │ L       │ 4,212   │ 5.05%        │ 70.92%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 19
• Missing values: 801 (0.96%)
• Most frequent: 'K' (9,038 times)
• Least frequent: 'L' (4,212 times)

 COLUMN: adjusted_sale_price
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │   Count │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │  458781 │      64 │ 0.08%        │ 0.08%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │  607575 │      57 │ 0.07%        │ 0.14%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │  561734 │      57 │ 0.07%        │ 0.21%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │  533178 │      55 │ 0.07%        │ 0.28%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │  601208 │      54 │ 0.06%        │ 0.34%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │  509330 │      53 │ 0.06%        │ 0.41%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │  557977 │      52 │ 0.06%        │ 0.47%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │  608383 │      52 │ 0.06%        │ 0.53%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │  483580 │      52 │ 0.06%        │ 0.59%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │  741490 │      51 │ 0.06%        │ 0.66%          │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 33,677
• Missing values: 0 (0.00%)
• Most frequent: '458781' (64 times)
• Least frequent: '741490' (51 times)
• Minimum value: 87189
• Maximum value: 7315704

sale_warning_codes = {
    1: "Personal Property Included",
    2: "1031 Trade",
    3: "Contract Or Cash Sale",
    4: "Presale",
    5: "Full Sales Price Not Reported",
    6: "Refund",
    7: "Questionable Per Sales Identification",
    8: "Questionable Per Appraisal",
    9: "Questionable Per Mainframe System (Obsolete Code)",
    10: "Tear Down",
    11: "Corporate Affiliates",
    12: "Estate Administrator, Guardian, Or Executor",
    13: "Bankruptcy - Receiver Or Trustee",
    14: "Sheriff / Tax Sale",
    15: "No Market Exposure",
    16: "Government Agency",
    17: "Non-Profit Organization",
    18: "Quit Claim Deed",
    19: "Seller'S Or Purchaser'S Assignment",
    20: "Correction Deed",
    21: "Trade",
    22: "Partial Interest (1/3, 1/2, Etc.)",
    23: "Forced Sale",
    24: "Easement Or Right-Of-Way",
    25: "Fulfillment Of Contract Deed",
    26: "Imp. Characteristics Changed Since Sale",
    27: "Timber And Forest Land",
    28: "New Plat (With Less Than 20% Sold)",
    29: "Segregation And/Or Merger",
    30: "Historic Property",
    31: "Exempt From Excise Tax",
    32: "$1,000 Sale Or Less",
    33: "Lease Or Lease-Hold",
    34: "Change Of Use",
    35: "Open Space Designation Continued/Ok'D After Sale",
    36: "Plottage",
    37: "Securing Of Debt",
    38: "Divorce",
    39: "Assumption Of Mortgage W/No Addl Consideration Pd",
    40: "Relocation - Sale To Service",
    41: "Relocation - Sale By Service",
    42: "Development Rights To Cnty,Cty,Or Prvt Developer",
    43: "Development Rights Parcel To Prvt Sector",
    44: "Tenant",
    45: "Multi-Parcel Sale",
    46: "Non-Representative Sale",
    47: "Non-Conventional Heating System",
    48: "Condo With Garage, Moorage, Or Storage",
    49: "Mobile Home",
    50: "Condo Wholesale",
    51: "Related Party, Friend, Or Neighbor",
    52: "Statement To Dor",
    53: "Residual Sales",
    54: "Affordable Housing Sales",
    55: "Shell",
    56: "Builder Or Developer Sales",
    57: "Selling Or Buying Costs Affecting Sale Price",
    58: "Preliminary Shortplat Approval",
    59: "Bulk Portfolio Sale",
    60: "Short Sale",
    61: "Financial Institution Resale",
    62: "Auction Sale"
}

# Select most predictive warning features to reduce dimensionality
train_data = extract_sale_warning_codes(train_data)
warning_features = [col for col in train_data.columns if col.startswith('sale_warning_')]
selector = SelectKBest(f_regression, k=20)  # Keep top 20 warning features
X_warnings_selected = selector.fit_transform(train_data[warning_features], train_data['adjusted_sale_price'])
selected_warnings = list(selector.get_feature_names_out())
selected_warnings

['sale_warning_3',
 'sale_warning_4',
 'sale_warning_10',
 'sale_warning_15',
 'sale_warning_16',
 'sale_warning_24',
 'sale_warning_26',
 'sale_warning_28',
 'sale_warning_29',
 'sale_warning_30',
 'sale_warning_34',
 'sale_warning_35',
 'sale_warning_36',
 'sale_warning_41',
 'sale_warning_44',
 'sale_warning_54',
 'sale_warning_56',
 'sale_warning_57',
 'sale_warning_58',
 'sale_warning_60']

column_analysis(train_data[geo_data])

==========================================================================================
 DETAILED COLUMN ANALYSIS - ALL COLUMNS
==========================================================================================

 COLUMN: latitude
======================================================================
Data Type: float64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │   Count │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │ 47.6853 │      56 │ 0.07%        │ 0.07%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │ 47.6882 │      55 │ 0.07%        │ 0.13%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │ 47.6911 │      54 │ 0.06%        │ 0.20%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │ 47.6721 │      54 │ 0.06%        │ 0.26%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │ 47.6727 │      51 │ 0.06%        │ 0.32%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │ 47.6901 │      51 │ 0.06%        │ 0.38%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │ 47.6842 │      47 │ 0.06%        │ 0.44%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │ 47.5671 │      47 │ 0.06%        │ 0.50%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │ 47.5517 │      45 │ 0.05%        │ 0.55%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │ 47.6919 │      45 │ 0.05%        │ 0.60%          │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 5,589
• Missing values: 0 (0.00%)
• Most frequent: '47.6853' (56 times)
• Least frequent: '47.6919' (45 times)
• Minimum value: 47.1552
• Maximum value: 47.7778

 COLUMN: longitude
======================================================================
Data Type: float64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤══════════╤═════════╤══════════════╤════════════════╕
│ Rank   │    Value │   Count │ Percentage   │ Cumulative %   │
╞════════╪══════════╪═════════╪══════════════╪════════════════╡
│ #1     │ -122.351 │      75 │ 0.09%        │ 0.09%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #2     │ -122.349 │      63 │ 0.08%        │ 0.17%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #3     │ -122.362 │      62 │ 0.07%        │ 0.24%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #4     │ -122.288 │      61 │ 0.07%        │ 0.31%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #5     │ -122.308 │      59 │ 0.07%        │ 0.38%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #6     │ -122.29  │      59 │ 0.07%        │ 0.45%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #7     │ -122.363 │      57 │ 0.07%        │ 0.52%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #8     │ -122.314 │      57 │ 0.07%        │ 0.59%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #9     │ -122.3   │      55 │ 0.07%        │ 0.66%          │
├────────┼──────────┼─────────┼──────────────┼────────────────┤
│ #10    │ -122.356 │      55 │ 0.07%        │ 0.72%          │
╘════════╧══════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 6,332
• Missing values: 0 (0.00%)
• Most frequent: '-122.3509' (75 times)
• Least frequent: '-122.356' (55 times)
• Minimum value: -122.5272
• Maximum value: -121.1616

 COLUMN: area
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │      69 │ 2,197   │ 2.63%        │ 2.63%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │      35 │ 2,027   │ 2.43%        │ 5.06%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │      32 │ 1,565   │ 1.87%        │ 6.93%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │      37 │ 1,550   │ 1.86%        │ 8.79%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │      72 │ 1,543   │ 1.85%        │ 10.64%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │      56 │ 1,509   │ 1.81%        │ 12.45%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │       6 │ 1,506   │ 1.80%        │ 14.25%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │      53 │ 1,500   │ 1.80%        │ 16.05%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │      93 │ 1,457   │ 1.75%        │ 17.79%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │      73 │ 1,388   │ 1.66%        │ 19.46%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 89
• Missing values: 0 (0.00%)
• Most frequent: '69' (2,197 times)
• Least frequent: '73' (1,388 times)
• Minimum value: 1
• Maximum value: 100

 COLUMN: city
======================================================================
Data Type: object
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════════╤═════════╤══════════════╤════════════════╕
│ Rank   │ Value       │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════════╪═════════╪══════════════╪════════════════╡
│ #1     │ SEATTLE     │ 23,874  │ 28.60%       │ 28.60%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #2     │ KING COUNTY │ 10,996  │ 13.17%       │ 41.77%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #3     │ BELLEVUE    │ 4,891   │ 5.86%        │ 47.63%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #4     │ KENT        │ 4,256   │ 5.10%        │ 52.73%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #5     │ SAMMAMISH   │ 4,186   │ 5.01%        │ 57.74%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #6     │ RENTON      │ 4,055   │ 4.86%        │ 62.60%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #7     │ FEDERAL WAY │ 3,596   │ 4.31%        │ 66.90%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #8     │ KIRKLAND    │ 3,519   │ 4.22%        │ 71.12%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #9     │ SHORELINE   │ 2,480   │ 2.97%        │ 74.09%         │
├────────┼─────────────┼─────────┼──────────────┼────────────────┤
│ #10    │ AUBURN      │ 2,444   │ 2.93%        │ 77.02%         │
╘════════╧═════════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 40
• Missing values: 0 (0.00%)
• Most frequent: 'SEATTLE' (23,874 times)
• Least frequent: 'AUBURN' (2,444 times)

 COLUMN: submarket
======================================================================
Data Type: object
Total Rows: 83,483 | Non-Missing: 82,682 | Missing: 801

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │ Value   │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │ K       │ 9,038   │ 10.83%       │ 10.83%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │ I       │ 7,345   │ 8.80%        │ 19.62%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │ B       │ 6,733   │ 8.07%        │ 27.69%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │ R       │ 6,659   │ 7.98%        │ 35.67%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │ Q       │ 6,071   │ 7.27%        │ 42.94%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │ O       │ 5,268   │ 6.31%        │ 49.25%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │ F       │ 4,760   │ 5.70%        │ 54.95%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │ D       │ 4,732   │ 5.67%        │ 60.62%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │ C       │ 4,392   │ 5.26%        │ 65.88%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │ L       │ 4,212   │ 5.05%        │ 70.92%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 19
• Missing values: 801 (0.96%)
• Most frequent: 'K' (9,038 times)
• Least frequent: 'L' (4,212 times)

train_data[geo_data].head(10)

# Confirm that all latitude and longitude points fall within King County, WA

def plot_points_with_king_county_boundary(lat_col, lon_col):
    """
    Plots latitude and longitude points over the actual King County boundary from OpenStreetMap.

    Parameters:
    lat_col (pd.Series): Latitude values.
    lon_col (pd.Series): Longitude values.
    """
    
    fig, ax = plt.subplots(figsize=(10, 8))
    
    # Plot property points
    ax.scatter(lon_col, lat_col, c='blue', label='Property Points', s=5, alpha=0.6, zorder=3)

    king_county = ox.geocode_to_gdf("King County, Washington, USA")
    king_county.plot(ax=ax, color='none', edgecolor='red', linewidth=2, zorder=2, label='King County Boundary')

    # Adjust plot limits
    ax.set_xlim(lon_col.min() - 0.05, lon_col.max() + 0.05)
    ax.set_ylim(lat_col.min() - 0.05, lat_col.max() + 0.05)
    
    # Labeling
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_title('Latitude and Longitude Points with King County Boundary')
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.5)
    plt.show()

plot_points_with_king_county_boundary(train_data.latitude, train_data.longitude)

len(train_data[train_data.longitude > -121.7])

76

train_data.city.sort_values().unique()

array(['ALGONA', 'AUBURN', 'BEAUX ARTS', 'BELLEVUE', 'BLACK DIAMOND',
       'BOTHELL', 'BURIEN', 'CARNATION', 'CLYDE HILL', 'COVINGTON',
       'DES MOINES', 'DUVALL', 'ENUMCLAW', 'FEDERAL WAY', 'HUNTS POINT',
       'ISSAQUAH', 'KENMORE', 'KENT', 'KING COUNTY', 'KIRKLAND',
       'LAKE FOREST PARK', 'MAPLE VALLEY', 'MEDINA', 'MERCER ISLAND',
       'MILTON', 'NEWCASTLE', 'NORMANDY PARK', 'NORTH BEND', 'PACIFIC',
       'REDMOND', 'RENTON', 'SAMMAMISH', 'SEATTLE', 'SHORELINE',
       'SKYKOMISH', 'SNOQUALMIE', 'SeaTac', 'TUKWILA', 'WOODINVILLE',
       'YARROW POINT'], dtype=object)

# Clean up value for Sea-Tac
train_data['city'] = train_data.city.replace('SeaTac', 'SEA-TAC')
train_data.city.sort_values().unique()

array(['ALGONA', 'AUBURN', 'BEAUX ARTS', 'BELLEVUE', 'BLACK DIAMOND',
       'BOTHELL', 'BURIEN', 'CARNATION', 'CLYDE HILL', 'COVINGTON',
       'DES MOINES', 'DUVALL', 'ENUMCLAW', 'FEDERAL WAY', 'HUNTS POINT',
       'ISSAQUAH', 'KENMORE', 'KENT', 'KING COUNTY', 'KIRKLAND',
       'LAKE FOREST PARK', 'MAPLE VALLEY', 'MEDINA', 'MERCER ISLAND',
       'MILTON', 'NEWCASTLE', 'NORMANDY PARK', 'NORTH BEND', 'PACIFIC',
       'REDMOND', 'RENTON', 'SAMMAMISH', 'SEA-TAC', 'SEATTLE',
       'SHORELINE', 'SKYKOMISH', 'SNOQUALMIE', 'TUKWILA', 'WOODINVILLE',
       'YARROW POINT'], dtype=object)

train_data.submarket.sort_values().unique()

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', nan], dtype=object)

train_data['submarket'] = train_data.submarket.fillna('Unknown')

train_data.submarket.value_counts()

submarket
K          9038
I          7345
B          6733
R          6659
Q          6071
O          5268
F          4760
D          4732
C          4392
L          4212
M          3905
A          3616
N          3531
P          2912
E          2839
G          2799
J          2310
S          1124
Unknown     801
H           436
Name: count, dtype: int64

column_analysis(train_data[legal_data])

==========================================================================================
 DETAILED COLUMN ANALYSIS - ALL COLUMNS
==========================================================================================

 COLUMN: zoning
======================================================================
Data Type: object
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │ Value   │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │ NR3     │ 12,795  │ 15.33%       │ 15.33%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │ R6      │ 7,575   │ 9.07%        │ 24.40%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │ R4      │ 4,774   │ 5.72%        │ 30.12%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │ RA5     │ 3,778   │ 4.53%        │ 34.64%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │ NR2     │ 3,498   │ 4.19%        │ 38.83%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │ SR-6    │ 3,063   │ 3.67%        │ 42.50%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │ R-5     │ 2,583   │ 3.09%        │ 45.60%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │ R-6     │ 2,521   │ 3.02%        │ 48.62%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │ RS7.2   │ 2,445   │ 2.93%        │ 51.55%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │ NR      │ 2,311   │ 2.77%        │ 54.31%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 348
• Missing values: 0 (0.00%)
• Most frequent: 'NR3' (12,795 times)
• Least frequent: 'NR' (2,311 times)

 COLUMN: subdivision
======================================================================
Data Type: object
Total Rows: 83,483 | Non-Missing: 76,229 | Missing: 7,254

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤════════════════════════════════╤═════════╤══════════════╤════════════════╕
│ Rank   │ Value                          │   Count │ Percentage   │ Cumulative %   │
╞════════╪════════════════════════════════╪═════════╪══════════════╪════════════════╡
│ #1     │ MAPLE LEAF TO GREEN LAKE CI... │     312 │ 0.37%        │ 0.37%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #2     │ GILMAN PARK ADD BLKS 01 THR... │     288 │ 0.34%        │ 0.72%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #3     │ HOMECROFT ADD                  │     194 │ 0.23%        │ 0.95%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #4     │ CHEROKEE BAY PARK ASSESSORS... │     186 │ 0.22%        │ 1.17%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #5     │ GILMANS ADD BLKS 01 THRU 87    │     180 │ 0.22%        │ 1.39%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #6     │ SEA VIEW PARK                  │     171 │ 0.20%        │ 1.59%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #7     │ MC MICKEN HEIGHTS DIV NO. 02   │     154 │ 0.18%        │ 1.78%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #8     │ SALMON BAY PARK ADD            │     151 │ 0.18%        │ 1.96%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #9     │ STATE ADD TO SEATTLE NO. 04    │     139 │ 0.17%        │ 2.13%          │
├────────┼────────────────────────────────┼─────────┼──────────────┼────────────────┤
│ #10    │ SOUTH PARK                     │     120 │ 0.14%        │ 2.27%          │
╘════════╧════════════════════════════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 9,217
• Missing values: 7,254 (8.69%)
• Most frequent: 'MAPLE LEAF TO GREEN LAKE CIRCLE POR OF' (312 times)
• Least frequent: 'SOUTH PARK' (120 times)

 COLUMN: present_use
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       2 │ 76,334  │ 91.44%       │ 91.44%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │      29 │ 6,667   │ 7.99%        │ 99.42%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       6 │ 482     │ 0.58%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 3
• Missing values: 0 (0.00%)
• Most frequent: '2' (76,334 times)
• Least frequent: '6' (482 times)
• Minimum value: 2
• Maximum value: 29
• Low cardinality (good for categorical analysis)

### Create zoning_categories to group similar zones
train_data['zoning_category'] = train_data['zoning'].apply(categorize_zoning)
train_data['zoning_category'].value_counts()

zoning_category
Residential Zones           39669
Neighborhood Residential    18862
Industrial and Other        12123
Low-Rise Residential         5946
Special Use Zones            5346
Mixed Use and Commercial     1537
Name: count, dtype: int64

### Drop present_use and subdivision in favor of the consolidated zoning categories
legal_features_drop = ['present_use', 'subdivision']

column_analysis(train_data[assessor_data])

==========================================================================================
 DETAILED COLUMN ANALYSIS - ALL COLUMNS
==========================================================================================

 COLUMN: land_val
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │   Count │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │  207000 │     613 │ 0.73%        │ 0.73%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │  216000 │     603 │ 0.72%        │ 1.46%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       0 │     524 │ 0.63%        │ 2.08%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │  177000 │     497 │ 0.60%        │ 2.68%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │  135000 │     461 │ 0.55%        │ 3.23%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │  486000 │     457 │ 0.55%        │ 3.78%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │  196000 │     454 │ 0.54%        │ 4.32%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │  243000 │     445 │ 0.53%        │ 4.86%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │  268000 │     429 │ 0.51%        │ 5.37%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │  225000 │     423 │ 0.51%        │ 5.88%          │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 2,289
• Missing values: 0 (0.00%)
• Most frequent: '207000' (613 times)
• Least frequent: '225000' (423 times)
• Minimum value: 0
• Maximum value: 10037000

 COLUMN: imp_val
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │    1000 │ 1,452   │ 1.74%        │ 1.74%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       0 │ 543     │ 0.65%        │ 2.39%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │  333000 │ 228     │ 0.27%        │ 2.66%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │  375000 │ 224     │ 0.27%        │ 2.93%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │  370000 │ 224     │ 0.27%        │ 3.20%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │  398000 │ 222     │ 0.27%        │ 3.47%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │  344000 │ 222     │ 0.27%        │ 3.73%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │  390000 │ 212     │ 0.25%        │ 3.99%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │  362000 │ 210     │ 0.25%        │ 4.24%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │  359000 │ 210     │ 0.25%        │ 4.49%          │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 2,234
• Missing values: 0 (0.00%)
• Most frequent: '1000' (1,452 times)
• Least frequent: '359000' (210 times)
• Minimum value: 0
• Maximum value: 6653000

 COLUMN: grade
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       7 │ 34,397  │ 41.20%       │ 41.20%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       8 │ 25,950  │ 31.08%       │ 72.29%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       9 │ 10,566  │ 12.66%       │ 84.94%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       6 │ 6,984   │ 8.37%        │ 93.31%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │      10 │ 3,600   │ 4.31%        │ 97.62%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │      11 │ 1,030   │ 1.23%        │ 98.85%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │       5 │ 693     │ 0.83%        │ 99.68%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │      12 │ 213     │ 0.26%        │ 99.94%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │       4 │ 31      │ 0.04%        │ 99.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │      13 │ 14      │ 0.02%        │ 99.99%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 13
• Missing values: 0 (0.00%)
• Most frequent: '7' (34,397 times)
• Least frequent: '13' (14 times)
• Minimum value: 1
• Maximum value: 13

 COLUMN: fbsmt_grade
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 48,555  │ 58.16%       │ 58.16%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       7 │ 14,941  │ 17.90%       │ 76.06%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       6 │ 8,234   │ 9.86%        │ 85.92%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       8 │ 7,416   │ 8.88%        │ 94.80%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       9 │ 2,193   │ 2.63%        │ 97.43%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │       5 │ 1,267   │ 1.52%        │ 98.95%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │      10 │ 529     │ 0.63%        │ 99.58%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │      11 │ 156     │ 0.19%        │ 99.77%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │       4 │ 144     │ 0.17%        │ 99.94%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │      12 │ 22      │ 0.03%        │ 99.97%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 13
• Missing values: 0 (0.00%)
• Most frequent: '0' (48,555 times)
• Least frequent: '12' (22 times)
• Minimum value: 0
• Maximum value: 13

 COLUMN: condition
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       3 │ 49,235  │ 58.98%       │ 58.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       4 │ 25,571  │ 30.63%       │ 89.61%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       5 │ 8,373   │ 10.03%       │ 99.64%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       2 │ 269     │ 0.32%        │ 99.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       1 │ 35      │ 0.04%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 5
• Missing values: 0 (0.00%)
• Most frequent: '3' (49,235 times)
• Least frequent: '1' (35 times)
• Minimum value: 1
• Maximum value: 5
• Low cardinality (good for categorical analysis)

analyze_dataframe(train_data[property_data])

==================================================
DATAFRAME ANALYSIS
==================================================
Shape: (83483, 27)
Data types:
int64      26
float64     1
Name: count, dtype: int64

--- NUMERIC COLUMNS (27) ---

column_analysis(train_data[property_data])

==========================================================================================
 DETAILED COLUMN ANALYSIS - ALL COLUMNS
==========================================================================================

 COLUMN: year_built
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │    1977 │ 1,687   │ 2.02%        │ 2.02%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │    1978 │ 1,674   │ 2.01%        │ 4.03%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │    1968 │ 1,611   │ 1.93%        │ 5.96%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │    1990 │ 1,510   │ 1.81%        │ 7.76%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │    1967 │ 1,475   │ 1.77%        │ 9.53%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │    1989 │ 1,416   │ 1.70%        │ 11.23%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │    1962 │ 1,385   │ 1.66%        │ 12.89%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │    1987 │ 1,385   │ 1.66%        │ 14.55%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │    1979 │ 1,358   │ 1.63%        │ 16.17%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │    1988 │ 1,233   │ 1.48%        │ 17.65%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 125
• Missing values: 0 (0.00%)
• Most frequent: '1977' (1,687 times)
• Least frequent: '1988' (1,233 times)
• Minimum value: 1900
• Maximum value: 2024

 COLUMN: year_reno
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 83,476  │ 99.99%       │ 99.99%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │    2022 │ 2       │ 0.00%        │ 99.99%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │    2024 │ 1       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │    2017 │ 1       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │    2020 │ 1       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │    2009 │ 1       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │    2023 │ 1       │ 0.00%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 7
• Missing values: 0 (0.00%)
• Most frequent: '0' (83,476 times)
• Least frequent: '2023' (1 times)
• Minimum value: 0
• Maximum value: 2024
• Low cardinality (good for categorical analysis)

 COLUMN: sqft
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │   Count │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │    1800 │     546 │ 0.65%        │ 0.65%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │    1300 │     534 │ 0.64%        │ 1.29%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │    1200 │     488 │ 0.58%        │ 1.88%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │    1660 │     477 │ 0.57%        │ 2.45%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │    1440 │     476 │ 0.57%        │ 3.02%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │    1700 │     475 │ 0.57%        │ 3.59%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │    2020 │     473 │ 0.57%        │ 4.16%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │    1900 │     471 │ 0.56%        │ 4.72%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │    2000 │     471 │ 0.56%        │ 5.28%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │    1820 │     469 │ 0.56%        │ 5.85%          │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 1,718
• Missing values: 0 (0.00%)
• Most frequent: '1800' (546 times)
• Least frequent: '1820' (469 times)
• Minimum value: 200
• Maximum value: 13310

 COLUMN: sqft_lot
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │    5000 │ 1,249   │ 1.50%        │ 1.50%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │    6000 │ 1,059   │ 1.27%        │ 2.76%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │    7200 │ 918     │ 1.10%        │ 3.86%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │    4000 │ 838     │ 1.00%        │ 4.87%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │    7500 │ 497     │ 0.60%        │ 5.46%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │    9600 │ 474     │ 0.57%        │ 6.03%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │    8400 │ 472     │ 0.57%        │ 6.60%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │    4800 │ 414     │ 0.50%        │ 7.09%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │    4500 │ 318     │ 0.38%        │ 7.47%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │    9000 │ 306     │ 0.37%        │ 7.84%          │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 19,966
• Missing values: 0 (0.00%)
• Most frequent: '5000' (1,249 times)
• Least frequent: '9000' (306 times)
• Minimum value: 381
• Maximum value: 2076940

 COLUMN: sqft_fbsmt
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 48,555  │ 58.16%       │ 58.16%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │     500 │ 869     │ 1.04%        │ 59.20%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │     600 │ 800     │ 0.96%        │ 60.16%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │     400 │ 779     │ 0.93%        │ 61.09%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │     700 │ 689     │ 0.83%        │ 61.92%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │     800 │ 673     │ 0.81%        │ 62.73%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │    1000 │ 599     │ 0.72%        │ 63.44%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │     300 │ 571     │ 0.68%        │ 64.13%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │     900 │ 563     │ 0.67%        │ 64.80%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │     480 │ 450     │ 0.54%        │ 65.34%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 499
• Missing values: 0 (0.00%)
• Most frequent: '0' (48,555 times)
• Least frequent: '480' (450 times)
• Minimum value: 0
• Maximum value: 5110

 COLUMN: sqft_1
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │    1010 │ 1,117   │ 1.34%        │ 1.34%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │    1200 │ 1,082   │ 1.30%        │ 2.63%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │    1300 │ 1,056   │ 1.26%        │ 3.90%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │    1250 │ 1,042   │ 1.25%        │ 5.15%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │    1080 │ 1,002   │ 1.20%        │ 6.35%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │    1060 │ 992     │ 1.19%        │ 7.54%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │    1090 │ 951     │ 1.14%        │ 8.67%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │    1040 │ 915     │ 1.10%        │ 9.77%          │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │    1120 │ 912     │ 1.09%        │ 10.86%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │    1180 │ 904     │ 1.08%        │ 11.95%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 1,226
• Missing values: 0 (0.00%)
• Most frequent: '1010' (1,117 times)
• Least frequent: '1180' (904 times)
• Minimum value: 80
• Maximum value: 7760

 COLUMN: stories
======================================================================
Data Type: float64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │     1   │ 40,955  │ 49.06%       │ 49.06%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │     2   │ 33,146  │ 39.70%       │ 88.76%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │     1.5 │ 6,701   │ 8.03%        │ 96.79%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │     3   │ 2,156   │ 2.58%        │ 99.37%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │     2.5 │ 462     │ 0.55%        │ 99.92%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │     4   │ 45      │ 0.05%        │ 99.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │     3.5 │ 17      │ 0.02%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │     4.5 │ 1       │ 0.00%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 8
• Missing values: 0 (0.00%)
• Most frequent: '1.0' (40,955 times)
• Least frequent: '4.5' (1 times)
• Minimum value: 1.0
• Maximum value: 4.5
• Low cardinality (good for categorical analysis)

 COLUMN: beds
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       3 │ 37,521  │ 44.94%       │ 44.94%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       4 │ 28,395  │ 34.01%       │ 78.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       2 │ 9,122   │ 10.93%       │ 89.88%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       5 │ 6,869   │ 8.23%        │ 98.11%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       6 │ 854     │ 1.02%        │ 99.14%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │       1 │ 548     │ 0.66%        │ 99.79%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │       7 │ 96      │ 0.11%        │ 99.91%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │       8 │ 36      │ 0.04%        │ 99.95%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │       0 │ 27      │ 0.03%        │ 99.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │       9 │ 7       │ 0.01%        │ 99.99%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 13
• Missing values: 0 (0.00%)
• Most frequent: '3' (37,521 times)
• Least frequent: '9' (7 times)
• Minimum value: 0
• Maximum value: 13

 COLUMN: bath_full
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       1 │ 43,834  │ 52.51%       │ 52.51%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 33,585  │ 40.23%       │ 92.74%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 4,743   │ 5.68%        │ 98.42%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       0 │ 819     │ 0.98%        │ 99.40%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       4 │ 471     │ 0.56%        │ 99.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │       5 │ 28      │ 0.03%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │       6 │ 2       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │       9 │ 1       │ 0.00%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 8
• Missing values: 0 (0.00%)
• Most frequent: '1' (43,834 times)
• Least frequent: '9' (1 times)
• Minimum value: 0
• Maximum value: 9
• Low cardinality (good for categorical analysis)

 COLUMN: bath_3qtr
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 45,360  │ 54.33%       │ 54.33%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 32,143  │ 38.50%       │ 92.84%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       2 │ 5,655   │ 6.77%        │ 99.61%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 294     │ 0.35%        │ 99.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       4 │ 24      │ 0.03%        │ 99.99%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │       5 │ 7       │ 0.01%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 6
• Missing values: 0 (0.00%)
• Most frequent: '0' (45,360 times)
• Least frequent: '5' (7 times)
• Minimum value: 0
• Maximum value: 5
• Low cardinality (good for categorical analysis)

 COLUMN: bath_half
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 44,455  │ 53.25%       │ 53.25%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 38,151  │ 45.70%       │ 98.95%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       2 │ 847     │ 1.01%        │ 99.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 25      │ 0.03%        │ 99.99%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       5 │ 3       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │       4 │ 1       │ 0.00%        │ 100.00%        │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │      12 │ 1       │ 0.00%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 7
• Missing values: 0 (0.00%)
• Most frequent: '0' (44,455 times)
• Least frequent: '12' (1 times)
• Minimum value: 0
• Maximum value: 12
• Low cardinality (good for categorical analysis)

 COLUMN: garb_sqft
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 64,516  │ 77.28%       │ 77.28%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │     200 │ 1,107   │ 1.33%        │ 78.61%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │     240 │ 1,039   │ 1.24%        │ 79.85%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │     290 │ 778     │ 0.93%        │ 80.78%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │     220 │ 742     │ 0.89%        │ 81.67%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │     260 │ 728     │ 0.87%        │ 82.54%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │     480 │ 646     │ 0.77%        │ 83.32%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │     310 │ 554     │ 0.66%        │ 83.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │     300 │ 499     │ 0.60%        │ 84.58%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │     180 │ 491     │ 0.59%        │ 85.17%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 248
• Missing values: 0 (0.00%)
• Most frequent: '0' (64,516 times)
• Least frequent: '180' (491 times)
• Minimum value: 0
• Maximum value: 4000

 COLUMN: gara_sqft
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 37,014  │ 44.34%       │ 44.34%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │     440 │ 3,486   │ 4.18%        │ 48.51%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │     480 │ 3,034   │ 3.63%        │ 52.15%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │     400 │ 2,240   │ 2.68%        │ 54.83%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │     460 │ 2,239   │ 2.68%        │ 57.51%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │     420 │ 2,142   │ 2.57%        │ 60.08%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │     530 │ 1,383   │ 1.66%        │ 61.73%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │     500 │ 1,324   │ 1.59%        │ 63.32%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #9     │     510 │ 900     │ 1.08%        │ 64.40%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #10    │     550 │ 882     │ 1.06%        │ 65.46%         │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 522
• Missing values: 0 (0.00%)
• Most frequent: '0' (37,014 times)
• Least frequent: '550' (882 times)
• Minimum value: 0
• Maximum value: 4404

 COLUMN: wfnt
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 82,598  │ 98.94%       │ 98.94%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       8 │ 325     │ 0.39%        │ 99.33%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       9 │ 187     │ 0.22%        │ 99.55%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 173     │ 0.21%        │ 99.76%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       6 │ 124     │ 0.15%        │ 99.91%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #6     │       7 │ 62      │ 0.07%        │ 99.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #7     │       1 │ 8       │ 0.01%        │ 99.99%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #8     │       5 │ 6       │ 0.01%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 8
• Missing values: 0 (0.00%)
• Most frequent: '0' (82,598 times)
• Least frequent: '5' (6 times)
• Minimum value: 0
• Maximum value: 9
• Low cardinality (good for categorical analysis)

 COLUMN: golf
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 83,062  │ 99.50%       │ 99.50%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 421     │ 0.50%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 2
• Missing values: 0 (0.00%)
• Most frequent: '0' (83,062 times)
• Least frequent: '1' (421 times)
• Minimum value: 0
• Maximum value: 1
• Low cardinality (good for categorical analysis)

 COLUMN: greenbelt
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 81,087  │ 97.13%       │ 97.13%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 2,396   │ 2.87%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 2
• Missing values: 0 (0.00%)
• Most frequent: '0' (81,087 times)
• Least frequent: '1' (2,396 times)
• Minimum value: 0
• Maximum value: 1
• Low cardinality (good for categorical analysis)

 COLUMN: noise_traffic
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 72,472  │ 86.81%       │ 86.81%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 6,704   │ 8.03%        │ 94.84%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       2 │ 3,690   │ 4.42%        │ 99.26%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 617     │ 0.74%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (72,472 times)
• Least frequent: '3' (617 times)
• Minimum value: 0
• Maximum value: 3
• Low cardinality (good for categorical analysis)

 COLUMN: view_rainier
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 82,905  │ 99.31%       │ 99.31%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 300     │ 0.36%        │ 99.67%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 241     │ 0.29%        │ 99.96%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 37      │ 0.04%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (82,905 times)
• Least frequent: '4' (37 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_olympics
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 81,768  │ 97.95%       │ 97.95%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 1,038   │ 1.24%        │ 99.19%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 460     │ 0.55%        │ 99.74%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 217     │ 0.26%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (81,768 times)
• Least frequent: '4' (217 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_cascades
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 81,535  │ 97.67%       │ 97.67%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 1,324   │ 1.59%        │ 99.25%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 508     │ 0.61%        │ 99.86%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 116     │ 0.14%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (81,535 times)
• Least frequent: '4' (116 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_territorial
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 76,605  │ 91.76%       │ 91.76%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 4,335   │ 5.19%        │ 96.95%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 1,867   │ 2.24%        │ 99.19%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 676     │ 0.81%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (76,605 times)
• Least frequent: '4' (676 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_skyline
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 82,962  │ 99.38%       │ 99.38%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 319     │ 0.38%        │ 99.76%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 121     │ 0.14%        │ 99.90%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 81      │ 0.10%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (82,962 times)
• Least frequent: '4' (81 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_sound
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 81,420  │ 97.53%       │ 97.53%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 751     │ 0.90%        │ 98.43%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       2 │ 573     │ 0.69%        │ 99.11%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 464     │ 0.56%        │ 99.67%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       4 │ 275     │ 0.33%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 5
• Missing values: 0 (0.00%)
• Most frequent: '0' (81,420 times)
• Least frequent: '4' (275 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_lakewash
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 81,665  │ 97.82%       │ 97.82%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       1 │ 723     │ 0.87%        │ 98.69%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       2 │ 547     │ 0.66%        │ 99.34%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 352     │ 0.42%        │ 99.77%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       4 │ 196     │ 0.23%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 5
• Missing values: 0 (0.00%)
• Most frequent: '0' (81,665 times)
• Least frequent: '4' (196 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_lakesamm
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 82,999  │ 99.42%       │ 99.42%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 151     │ 0.18%        │ 99.60%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       1 │ 143     │ 0.17%        │ 99.77%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       3 │ 105     │ 0.13%        │ 99.90%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #5     │       4 │ 85      │ 0.10%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 5
• Missing values: 0 (0.00%)
• Most frequent: '0' (82,999 times)
• Least frequent: '4' (85 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_otherwater
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 82,858  │ 99.25%       │ 99.25%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 302     │ 0.36%        │ 99.61%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 167     │ 0.20%        │ 99.81%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 156     │ 0.19%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (82,858 times)
• Least frequent: '4' (156 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

 COLUMN: view_other
======================================================================
Data Type: int64
Total Rows: 83,483 | Non-Missing: 83,483 | Missing: 0

 VALUE DISTRIBUTION:
--------------------------------------------------
╒════════╤═════════╤═════════╤══════════════╤════════════════╕
│ Rank   │   Value │ Count   │ Percentage   │ Cumulative %   │
╞════════╪═════════╪═════════╪══════════════╪════════════════╡
│ #1     │       0 │ 83,044  │ 99.47%       │ 99.47%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #2     │       2 │ 324     │ 0.39%        │ 99.86%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #3     │       3 │ 95      │ 0.11%        │ 99.98%         │
├────────┼─────────┼─────────┼──────────────┼────────────────┤
│ #4     │       4 │ 20      │ 0.02%        │ 100.00%        │
╘════════╧═════════╧═════════╧══════════════╧════════════════╛

 STATISTICS:
------------------------------
• Unique values: 4
• Missing values: 0 (0.00%)
• Most frequent: '0' (83,044 times)
• Least frequent: '4' (20 times)
• Minimum value: 0
• Maximum value: 4
• Low cardinality (good for categorical analysis)

### Run Outlier Analysis on the numerical sqft features
calculate_outlier_table(train_data, ['sqft', 'sqft_1', 'sqft_fbsmt', 'gara_sqft', 'garb_sqft', 'sqft_lot'])

### Examine sqft = 0 along with sqft_1 
sqft_var = ['sqft', 'sqft_1', 'sqft_fbsmt', 'gara_sqft', 'garb_sqft', 'sqft_lot']


for var in sqft_var:
    print(f'{var} at 1% percentile = {train_data[var].quantile(.01)}')
    print(f'{var} at 5% percentile = {train_data[var].quantile(.01)}')
    print(f'{var} at 95% percentile = {train_data[var].quantile(.95)}')
    print(f'{var} at 99% percentile = {train_data[var].quantile(.99)}')
    print("")

sqft at 1% percentile = 760.0
sqft at 5% percentile = 760.0
sqft at 95% percentile = 3670.0
sqft at 99% percentile = 4640.0

sqft_1 at 1% percentile = 400.0
sqft_1 at 5% percentile = 400.0
sqft_1 at 95% percentile = 2050.0
sqft_1 at 99% percentile = 2680.0

sqft_fbsmt at 1% percentile = 0.0
sqft_fbsmt at 5% percentile = 0.0
sqft_fbsmt at 95% percentile = 1200.0
sqft_fbsmt at 99% percentile = 1650.0

gara_sqft at 1% percentile = 0.0
gara_sqft at 5% percentile = 0.0
gara_sqft at 95% percentile = 750.0
gara_sqft at 99% percentile = 950.0

garb_sqft at 1% percentile = 0.0
garb_sqft at 5% percentile = 0.0
garb_sqft at 95% percentile = 520.0
garb_sqft at 99% percentile = 700.0

sqft_lot at 1% percentile = 797.0
sqft_lot at 5% percentile = 797.0
sqft_lot at 95% percentile = 39166.69999999998
sqft_lot at 99% percentile = 193439.6599999987

large_attached_garage = train_data[train_data['gara_sqft'] > 1000]

large_attached_garage.describe()

len(train_data[train_data['sqft'] >= 4900])

568

train_data[train_data['sqft'] >= 10000]

train_data[(train_data['sqft'] <= 300) & (train_data['beds'] > 0)]

#Logically sqft_1 should always be less than sqft
#Potential issues in data quality due one or more of the following: Manual input mistakes during property assessment; Different measurement standards applied at different times; Values recorded by different assessors using varying methodologies 
train_data[(train_data['sqft'] < train_data['sqft_1'])].head(10)

# Will remove the 126 affected rows due to low number of records and inability to which value (sqft or sqft_1) is incorrect
train_data = train_data.drop(train_data[(train_data['sqft'] < train_data['sqft_1'])].index)

sns.boxenplot(data=train_data[train_data['sqft_fbsmt'] > 0], x='sqft_fbsmt')
plt.show()

sns.boxenplot(data=train_data[train_data['gara_sqft'] > 0], x='gara_sqft')
plt.show()

sns.boxenplot(data=train_data[train_data['garb_sqft'] > 0], x='garb_sqft')
plt.show()

train_data[(train_data['garb_sqft'] >= 4000)]

### Convert select features to binomial ###

# sqft_fbsmt (1 if finished basement)
train_data['finished_basement_type'] = np.select([train_data['sqft_fbsmt'] == 0,
                                                  (train_data['sqft_fbsmt'] > 0) & (train_data['sqft_fbsmt'] <= 500),
                                                  (train_data['sqft_fbsmt'] > 1250) & (train_data['sqft_fbsmt'] <= 2000),
                                                  train_data['sqft_fbsmt'] > 2000],
                                                 ['None', 'Small', 'Large', 'XLarge'],
                                                 default='Medium')

# gara_sqft (1 if attached garage)
train_data['attached_garage_type'] = np.select([train_data['gara_sqft'] == 0,
                                                  (train_data['gara_sqft'] > 0) & (train_data['gara_sqft'] <= 250),
                                                  (train_data['gara_sqft'] > 750) & (train_data['gara_sqft'] <= 1000),
                                                  train_data['gara_sqft'] >= 1000],
                                                 ['None', 'Small', 'Large', 'XLarge'],
                                                 default='Medium')

# garb_sqft (1 if basement garage)
train_data['basement_garage_type'] = np.select([train_data['garb_sqft'] == 0,
                                                  (train_data['garb_sqft'] > 0) & (train_data['garb_sqft'] <= 250),
                                                  (train_data['garb_sqft'] > 750) & (train_data['garb_sqft'] <= 1000),
                                                  train_data['garb_sqft'] >= 1000],
                                                 ['None', 'Small', 'Large', 'XLarge'],
                                                 default='Medium')

# wfnt (1 if wfnt indicates some level of access)
train_data['has_waterfront_access'] = [0 if x == 0 else 1 for x in train_data.wfnt]

# noise_traffic (1 if above typical noise levels)
train_data['above_typical_noise'] = [0 if x == 0 else 1 for x in train_data.noise_traffic] 

# all view attributes (1 if view)
train_data['has_view_rainier'] = [0 if x == 0 else 1 for x in train_data.view_rainier]
train_data['has_view_olympics'] = [0 if x == 0 else 1 for x in train_data.view_olympics]
train_data['has_view_cascades'] = [0 if x == 0 else 1 for x in train_data.view_cascades]
train_data['has_view_territorial'] = [0 if x == 0 else 1 for x in train_data.view_territorial]
train_data['has_view_skyline'] = [0 if x == 0 else 1 for x in train_data.view_skyline]
train_data['has_view_sound'] = [0 if x == 0 else 1 for x in train_data.view_sound]
train_data['has_view_lakewash'] = [0 if x == 0 else 1 for x in train_data.view_lakewash]
train_data['has_view_lakesamm'] = [0 if x == 0 else 1 for x in train_data.view_lakesamm]
train_data['has_view_otherwater'] = [0 if x == 0 else 1 for x in train_data.view_otherwater]
train_data['has_view_other'] = [0 if x == 0 else 1 for x in train_data.view_other]

selected_features = [
    'id',
    
    ### sale features ###
    'sale_date',
    'adjusted_sale_price',
    'sale_nbr',
    'sale_warning_3',
    'sale_warning_4',
    'sale_warning_10',
    'sale_warning_15',
    'sale_warning_16',
    'sale_warning_17',
    'sale_warning_24',
    'sale_warning_26',
    'sale_warning_29',
    'sale_warning_30',
    'sale_warning_35',
    'sale_warning_36',
    'sale_warning_38',
    'sale_warning_40',
    'sale_warning_41',
    'sale_warning_44',
    'sale_warning_54',
    'sale_warning_57',
    'sale_warning_58',
    'sale_warning_60',
    
    ### Geographic Features ###
    'latitude',
    'longitude',
    'area',
    'city',
    
    ### Legal Features ###
    'zoning_category',
    
    ### Property Features ###
    'year_built',
    'year_reno',
    'sqft',
    'sqft_1',
    'stories',
    'beds',
    'bath_full',
    'bath_3qtr',
    'bath_half',
    'golf',
    'greenbelt',
    'submarket',
    'finished_basement_type',
    'attached_garage_type',
    'basement_garage_type',
    'has_waterfront_access',
    'above_typical_noise',
    'has_view_rainier',
    'has_view_olympics',
    'has_view_cascades',
    'has_view_territorial',
    'has_view_skyline',
    'has_view_sound',
    'has_view_lakewash',
    'has_view_lakesamm',
    'has_view_otherwater',
    'has_view_other'
]

train_data = train_data[selected_features]

# Log-transform sale_price and square footage variables (right-skewed distributions)
train_data.drop(index=train_data[train_data['sqft'] == 0].index, inplace=True)
train_data['log_adj_sale_price'] = np.log(train_data['adjusted_sale_price'])
train_data['log_sqft'] = np.log(train_data['sqft'])
train_data['log_sqft_1'] = np.log(train_data['sqft_1'])
train_data.head()

# Distance-based features: Distance to city centers, schools, transit hubs
# Neighborhood clustering: K-means clustering on lat/lon to create location groups
# Market density: Count of recent sales within radius

def create_spatial_features(df):
    """
    Create spatial features for location-based insights in house price prediction.
    
    Parameters:
    df (DataFrame): Input dataframe containing latitude, longitude, and location data
    
    Returns:
    DataFrame: Enhanced dataframe with spatial features
    """
    df_spatial = df.copy()
    
    # Reference points for distance calculations
    reference_points = {
        'seattle_downtown': (47.6062, -122.3321),
        'bellevue_downtown': (47.6101, -122.2015),
        'redmond_downtown': (47.6740, -122.1215),
        'sea_airport': (47.4502, -122.3088)
    }
    
    if 'latitude' in df_spatial.columns and 'longitude' in df_spatial.columns:
        
        # 1. Distance-based features
        print("Creating distance-based features...")
        for location, coords in reference_points.items():
            distances = []
            for idx, row in df_spatial.iterrows():
                if pd.notna(row['latitude']) and pd.notna(row['longitude']):
                    try:
                        distance = geodesic(
                            (row['latitude'], row['longitude']), 
                            coords
                        ).kilometers
                        distances.append(distance)
                    except:
                        distances.append(np.nan)
                else:
                    distances.append(np.nan)
            
            df_spatial[f'distance_to_{location}'] = distances
        
        # 2. Geographic clustering for neighborhood analysis
        print("Creating geographic clusters...")
        valid_coords = df_spatial[['latitude', 'longitude']].dropna()
        if len(valid_coords) > 50:
            cluster_configs = {
                'macro_neighborhood': min(25, len(valid_coords)//4),
                'micro_neighborhood': min(100, len(valid_coords)//2),
                'local_area': min(200, len(valid_coords))
            }
            
            for cluster_name, n_clusters in cluster_configs.items():
                if len(valid_coords) >= n_clusters and n_clusters > 1:
                    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
                    df_spatial[f'{cluster_name}_cluster'] = -1
                    clusters = kmeans.fit_predict(valid_coords)
                    df_spatial.loc[valid_coords.index, f'{cluster_name}_cluster'] = clusters

        # 3. Geographic zones
        print("Creating geographic zones...")
        def categorize_location(lat, lon):
            if pd.isna(lat) or pd.isna(lon):
                return 'Unknown'
            
            # Seattle metro area zones
            if 47.6 <= lat <= 47.8 and -122.4 <= lon <= -122.2:
                return 'Seattle_Core'
            elif 47.5 <= lat <= 47.7 and -122.2 <= lon <= -122.0:
                return 'Eastside_Core'
            elif 47.3 <= lat <= 47.5:
                return 'South_County'
            elif lat >= 47.7:
                return 'North_County'
            else:
                return 'Other'
        
        df_spatial['geographic_zone'] = df_spatial.apply(
            lambda row: categorize_location(row['latitude'], row['longitude']), 
            axis=1
        )
    
    print(f"Created {len([col for col in df_spatial.columns if col not in df.columns])} spatial features")
    return df_spatial

# Sale seasonality: Month, quarter indicators
# Market timing: Time since last major economic event
# Property age: Age at sale, years since renovation
# Market momentum: Rolling average prices in area

def create_temporal_features(df):
    """
    Create temporal features for time-based patterns in house price prediction.
    
    Parameters:
    df (DataFrame): Input dataframe containing sale_date and property age data
    
    Returns:
    DataFrame: Enhanced dataframe with temporal features
    """
    df_temporal = df.copy()
    
    if 'sale_date' in df_temporal.columns:
        print("Processing sale date features...")
        # Ensure sale_date is datetime
        df_temporal['sale_date'] = pd.to_datetime(df_temporal['sale_date'])
        
        # 1. Basic time components
        df_temporal['sale_year'] = df_temporal['sale_date'].dt.year
        df_temporal['sale_month'] = df_temporal['sale_date'].dt.month
        df_temporal['sale_quarter'] = df_temporal['sale_date'].dt.quarter
        df_temporal['sale_day_of_year'] = df_temporal['sale_date'].dt.dayofyear
        df_temporal['sale_week_of_year'] = df_temporal['sale_date'].dt.isocalendar().week
        
        # 2. Cyclical encoding for better ML performance
        df_temporal['month_sin'] = np.sin(2 * np.pi * df_temporal['sale_month'] / 12)
        df_temporal['month_cos'] = np.cos(2 * np.pi * df_temporal['sale_month'] / 12)
        df_temporal['quarter_sin'] = np.sin(2 * np.pi * df_temporal['sale_quarter'] / 4)
        df_temporal['quarter_cos'] = np.cos(2 * np.pi * df_temporal['sale_quarter'] / 4)
        
        # 3. Seasonal indicators
        df_temporal['is_spring'] = ((df_temporal['sale_month'] >= 3) & 
                                   (df_temporal['sale_month'] <= 5)).astype(int)
        df_temporal['is_summer'] = ((df_temporal['sale_month'] >= 6) & 
                                   (df_temporal['sale_month'] <= 8)).astype(int)
        df_temporal['is_fall'] = ((df_temporal['sale_month'] >= 9) & 
                                 (df_temporal['sale_month'] <= 11)).astype(int)
        df_temporal['is_winter'] = ((df_temporal['sale_month'] == 12) | 
                                   (df_temporal['sale_month'] <= 2)).astype(int)
        
        # 4. Market timing features
        print("Creating market timing features...")
        market_events = {
            '2000-03-10': 'dot_com_peak',
            '2008-09-15': 'financial_crisis',
            '2020-03-15': 'covid_start'
        }
        
        for event_date, event_name in market_events.items():
            event_datetime = pd.to_datetime(event_date)
            days_since = (df_temporal['sale_date'] - event_datetime).dt.days
            df_temporal[f'days_since_{event_name}'] = days_since
            df_temporal[f'months_since_{event_name}'] = days_since / 30.44
        
        # 5. Market cycle indicators
        df_temporal['is_pre_2008_crisis'] = (df_temporal['sale_date'] < '2008-01-01').astype(int)
        df_temporal['is_post_2008_recovery'] = ((df_temporal['sale_date'] >= '2012-01-01') & 
                                               (df_temporal['sale_date'] < '2020-01-01')).astype(int)
        df_temporal['is_covid_era'] = (df_temporal['sale_date'] >= '2020-01-01').astype(int)
    
    # 6. Property age features
    if 'year_built' in df_temporal.columns and 'sale_date' in df_temporal.columns:
        print("Creating property age features...")
        # Negative values of property_age_at_sale represent land sales where construction occured after sale
        df_temporal['property_age_at_sale'] = df_temporal['sale_year'] - df_temporal['year_built'] 
        df_temporal['is_land_sale_post_construction'] = (df_temporal['property_age_at_sale'] <= 0).astype(int)
        df_temporal['is_vintage'] = (df_temporal['property_age_at_sale'] >= 50).astype(int)
    
    # 7. Renovation timing
    if 'year_reno' in df_temporal.columns and 'sale_date' in df_temporal.columns:
        print("Creating renovation features...")
        df_temporal['years_since_renovation'] = df_temporal['sale_year'] - df_temporal['year_reno']
        df_temporal['has_been_renovated'] = (df_temporal['year_reno'] > df_temporal['year_built']).astype(int)
    
    print(f"Created {len([col for col in df_temporal.columns if col not in df.columns])} temporal features")
    return df_temporal

# Composite scores: Price per square foot, bathroom-to-bedroom ratio
# Quality interactions: Grade × condition interactions
# Size categories: Binned square footage categories
# Luxury indicators: Combination of high-end features

def create_property_features(df):
    """
    Create enhanced property characteristic features for house price prediction.
    
    Parameters:
    df (DataFrame): Input dataframe containing property characteristics
    
    Returns:
    DataFrame: Enhanced dataframe with property features
    """
    df_property = df.copy()
    
    print("Creating property characteristic features...")
    
    # 1. Size and ratio features
    if 'sqft' in df_property.columns and 'sqft_lot' in df_property.columns:
        df_property['house_to_lot_ratio'] = (df_property['sqft'] / 
                                            df_property['sqft_lot'].replace(0, np.nan))
        
        # Lot size categories
        df_property['lot_size_category'] = pd.cut(
            df_property['sqft_lot'], 
            bins=[0, 5000, 10000, 20000, np.inf],
            labels=['Small', 'Medium', 'Large', 'XLarge']
        )
    
    # 2. Room efficiency and ratios
    if 'beds' in df_property.columns and 'sqft' in df_property.columns:
        df_property['sqft_per_bedroom'] = (df_property['sqft'] / 
                                          df_property['beds'].replace(0, np.nan))
    
    # 3. Bathroom calculations
    bath_columns = ['bath_full', 'bath_3qtr', 'bath_half']
    available_bath_cols = [col for col in bath_columns if col in df_property.columns]
    
    if available_bath_cols:
        df_property['total_bathrooms'] = 0
        weights = {'bath_full': 1.0, 'bath_3qtr': 0.75, 'bath_half': 0.5}
        
        for col in available_bath_cols:
            df_property['total_bathrooms'] += (
                df_property[col].fillna(0) * weights.get(col, 1.0)
            )
        
        if 'beds' in df_property.columns:
            df_property['bathroom_bedroom_ratio'] = (
                df_property['total_bathrooms'] / 
                df_property['beds'].replace(0, np.nan)
            )
    
    # 4. Property type categorization
    # Multi-story classification
    if 'stories' in df_property.columns:
        df_property['is_single_story'] = (df_property['stories'] == 1.0).astype(int)
        df_property['is_multi_story'] = (df_property['stories'] > 1.0).astype(int)
    
    print(f"Created {len([col for col in df_property.columns if col not in df.columns])} property features")
    return df_property

# Comparative market analysis: Recent sales in same subdivision
# Supply indicators: Inventory levels by area and price range
# Economic indicators: Interest rates, local employment data (if available)

def create_market_context_features(df):
    """
    Create market context and comparative features for house price prediction.
    
    Parameters:
    df (DataFrame): Input dataframe containing sale data and location information
    
    Returns:
    DataFrame: Enhanced dataframe with market context features
    """
    df_market = df.copy()
    
    print("Creating market context features...")
    
    # 1. Time-based market features
    if 'sale_date' in df_market.columns and 'adjusted_sale_price' in df_market.columns:
        print("Processing temporal market patterns...")
        
        # Ensure proper datetime format
        df_market['sale_date'] = pd.to_datetime(df_market['sale_date'])
        df_market_sorted = df_market.sort_values('sale_date').copy()
        
        # Rolling market indicators (6-month windows)
        window_size = min(180, len(df_market_sorted) // 4)  # Adaptive window size
        
        if window_size >= 10:
            df_market_sorted['rolling_median_price_6m'] = (
                df_market_sorted['adjusted_sale_price']
                .rolling(window=window_size, min_periods=10)
                .median()
            )
            df_market_sorted['rolling_std_price_6m'] = (
                df_market_sorted['adjusted_sale_price']
                .rolling(window=window_size, min_periods=10)
                .std()
            )
            
            # Price volatility (coefficient of variation)
            df_market_sorted['market_volatility_6m'] = (
                df_market_sorted['rolling_std_price_6m'] / 
                df_market_sorted['rolling_median_price_6m']
            )
        
        # Restore original order
        df_market = df_market_sorted.sort_index()

        # Year-over-year market features
        if 'sale_year' in df_market.columns:
            yearly_stats = df_market.groupby('sale_year')['adjusted_sale_price'].agg([
                'median', 'mean', 'std', 'count'
            ]).reset_index()
            yearly_stats.columns = ['sale_year', 'yearly_median_price', 'yearly_mean_price', 
                                   'yearly_price_std', 'yearly_sale_count']
            
            df_market = df_market.merge(yearly_stats, on='sale_year', how='left')
            
            # Market activity indicators
            df_market['relative_to_yearly_median'] = (
                df_market['adjusted_sale_price'] / df_market['yearly_median_price']
            )
            df_market['is_high_activity_year'] = (
                df_market['yearly_sale_count'] > 
                df_market['yearly_sale_count'].quantile(0.75)
            ).astype(int)

    
    # 2. Price trend features
    if all(col in df_market.columns for col in ['sale_date', 'adjusted_sale_price']):
        print("Creating price momentum indicators...")
        
        df_market_sorted = df_market.sort_values('sale_date').copy()
        
        # Price momentum (30-day and 90-day trends)
        for window in [30, 90]:
            window_size = min(window, len(df_market_sorted) // 10)
            if window_size >= 5:
                df_market_sorted[f'price_trend_{window}d'] = (
                    df_market_sorted['adjusted_sale_price']
                    .rolling(window=window_size, min_periods=5)
                    .apply(lambda x: np.polyfit(range(len(x)), x, 1)[0] 
                          if len(x) >= 5 else np.nan, raw=False)
                )
        
        # Market momentum indicators
        if 'price_trend_30d' in df_market_sorted.columns:
            df_market_sorted['is_rising_market_30d'] = (
                df_market_sorted['price_trend_30d'] > 0
            ).astype(int)
            df_market_sorted['is_falling_market_30d'] = (
                df_market_sorted['price_trend_30d'] < 0
            ).astype(int)
        
        # Restore original order
        df_market = df_market_sorted.sort_index()
    
    # 3. Supply and demand indicators
    print("Creating supply/demand proxies...")
    
    # Monthly sales volume by area
    if all(col in df_market.columns for col in ['sale_date', 'city']):
        df_market['year_month'] = df_market['sale_date'].dt.to_period('M')
        
        monthly_volume = df_market.groupby(['city', 'year_month']).size().reset_index(name='monthly_volume')
        monthly_volume['year_month'] = monthly_volume['year_month'].astype(str)
        df_market['year_month'] = df_market['year_month'].astype(str)
        
        df_market = df_market.merge(monthly_volume, on=['city', 'year_month'], how='left')
        
        # Market heat indicator
        df_market['market_heat'] = pd.cut(
            df_market['monthly_volume'].fillna(0),
            bins=[0, 5, 15, 30, np.inf],
            labels=['Cold', 'Warm', 'Hot', 'Very_Hot']
        )
    
    # Clean up temporary columns
    columns_to_drop = ['year_month'] if 'year_month' in df_market.columns else []
    if columns_to_drop:
        df_market = df_market.drop(columns=columns_to_drop)
    
    print(f"Created {len([col for col in df_market.columns if col not in df.columns])} market context features")
    return df_market

df_enhanced = create_spatial_features(train_data)
df_enhanced = create_temporal_features(df_enhanced)
df_enhanced = create_property_features(df_enhanced)
df_enhanced = create_market_context_features(df_enhanced)
analyze_dataframe(df_enhanced)

Creating distance-based features...
Creating geographic clusters...
Creating geographic zones...
Created 8 spatial features
Processing sale date features...
Creating market timing features...
Creating property age features...
Creating renovation features...
Created 27 temporal features
Creating property characteristic features...
Created 5 property features
Creating market context features...
Processing temporal market patterns...
Creating price momentum indicators...
Creating supply/demand proxies...
Created 15 market context features
==================================================
DATAFRAME ANALYSIS
==================================================
Shape: (83431, 114)
Data types:
int64             70
float64           29
object             8
int32              4
datetime64[ns]     1
UInt32             1
category           1
Name: count, dtype: int64

--- NUMERIC COLUMNS (104) ---
sqft_per_bedroom:
  - Infinite values: 0
  - NaN values: 27
  - Extremely large values: 0
bathroom_bedroom_ratio:
  - Infinite values: 0
  - NaN values: 27
  - Extremely large values: 0
rolling_median_price_6m:
  - Infinite values: 0
  - NaN values: 9
  - Extremely large values: 0
rolling_std_price_6m:
  - Infinite values: 0
  - NaN values: 9
  - Extremely large values: 0
market_volatility_6m:
  - Infinite values: 0
  - NaN values: 9
  - Extremely large values: 0
price_trend_30d:
  - Infinite values: 0
  - NaN values: 4
  - Extremely large values: 0
price_trend_90d:
  - Infinite values: 0
  - NaN values: 4
  - Extremely large values: 0

--- NON-NUMERIC COLUMNS (10) ---
sale_date: datetime64[ns], 313 unique values, 0 (0.0) missing
sale_nbr: object, 3 unique values, 0 (0.0) missing
city: object, 40 unique values, 0 (0.0) missing
zoning_category: object, 6 unique values, 0 (0.0) missing
submarket: object, 20 unique values, 0 (0.0) missing
finished_basement_type: object, 5 unique values, 0 (0.0) missing
attached_garage_type: object, 5 unique values, 0 (0.0) missing
basement_garage_type: object, 5 unique values, 0 (0.0) missing
geographic_zone: object, 5 unique values, 0 (0.0) missing
market_heat: category, 4 unique values, 0 (0.0) missing

df_enhanced.to_csv('train_full_features.csv')

X = df_enhanced.drop(['adjusted_sale_price', 'log_adj_sale_price', 'id'], axis=1)
y = df_enhanced['adjusted_sale_price']
#y = df_enhanced['log_adj_sale_price']

numerical_features = []

categorical_features = []

datatime_features = []

for col in X.columns:
    if (X[col].dtype == 'int64') or (X[col].dtype == 'float64'):
        numerical_features.append(col)
    elif (X[col].dtype == 'object') or (X[col].dtype == 'category'):
        categorical_features.append(col)
    else:
        datatime_features.append(col)

# Define transformations
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=34)

X_train_transformed = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()

X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names)

X_test_transformed = preprocessor.transform(X_test)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names)

selector = SelectKBest(score_func=f_regression, k=50)
X_train_transformed_small = selector.fit_transform(X_train_transformed, y_train)
X_test_transformed_small = selector.transform(X_test_transformed)
X_train_transformed_small.shape

(62573, 50)

X_train_transformed_small

array([[-0.11786295,  0.66478132, -0.30115317, ...,  0.        ,
         0.        ,  0.        ],
       [-0.20352627,  0.16128939,  2.13725661, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.40805855,  0.29008965,  0.73132665, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.58137264, -0.22511139, -0.45492676, ...,  0.        ,
         0.        ,  0.        ],
       [-0.35256653, -0.56467572,  0.77526196, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.77986083, -0.68176686, -0.45492676, ...,  0.        ,
         0.        ,  0.        ]], shape=(62573, 50))

selector.get_feature_names_out()

array(['num__latitude', 'num__sqft', 'num__sqft_1', 'num__stories',
       'num__beds', 'num__bath_full', 'num__bath_3qtr', 'num__bath_half',
       'num__has_view_territorial', 'num__has_view_lakewash',
       'num__log_sqft', 'num__log_sqft_1',
       'num__distance_to_seattle_downtown',
       'num__distance_to_bellevue_downtown',
       'num__distance_to_redmond_downtown',
       'num__distance_to_sea_airport', 'num__days_since_dot_com_peak',
       'num__months_since_dot_com_peak',
       'num__days_since_financial_crisis',
       'num__months_since_financial_crisis',
       'num__days_since_covid_start', 'num__months_since_covid_start',
       'num__is_pre_2008_crisis', 'num__sqft_per_bedroom',
       'num__total_bathrooms', 'num__bathroom_bedroom_ratio',
       'num__is_single_story', 'num__is_multi_story',
       'num__rolling_median_price_6m', 'num__rolling_std_price_6m',
       'num__yearly_median_price', 'num__yearly_mean_price',
       'num__yearly_price_std', 'num__yearly_sale_count',
       'num__relative_to_yearly_median', 'num__is_high_activity_year',
       'num__price_trend_30d', 'num__price_trend_90d',
       'num__is_rising_market_30d', 'num__is_falling_market_30d',
       'cat__city_MERCER ISLAND', 'cat__submarket_D', 'cat__submarket_I',
       'cat__submarket_O', 'cat__submarket_R', 'cat__submarket_S',
       'cat__finished_basement_type_Large',
       'cat__attached_garage_type_Large',
       'cat__geographic_zone_Eastside_Core',
       'cat__geographic_zone_South_County'], dtype=object)

from sklearn.model_selection import GridSearchCV

# Define the model
qrf = RandomForestQuantileRegressor(random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
}

# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=qrf,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
    verbose=90
)

# Fit the model
grid_search.fit(X_train_transformed_small, y_train)

# Best model
best_qrf = grid_search.best_estimator_

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3; 1/6] START max_depth=None, n_estimators=100............................
[CV 1/3; 1/6] END max_depth=None, n_estimators=100;, score=-2085.218 total time= 1.7min
[CV 2/3; 1/6] START max_depth=None, n_estimators=100............................
[CV 2/3; 1/6] END max_depth=None, n_estimators=100;, score=-2184.625 total time= 1.7min
[CV 3/3; 1/6] START max_depth=None, n_estimators=100............................
[CV 3/3; 1/6] END max_depth=None, n_estimators=100;, score=-2144.231 total time= 1.6min
[CV 1/3; 2/6] START max_depth=None, n_estimators=200............................
[CV 1/3; 2/6] END max_depth=None, n_estimators=200;, score=-2050.715 total time= 3.3min
[CV 2/3; 2/6] START max_depth=None, n_estimators=200............................
[CV 2/3; 2/6] END max_depth=None, n_estimators=200;, score=-2146.086 total time= 3.3min
[CV 3/3; 2/6] START max_depth=None, n_estimators=200............................
[CV 3/3; 2/6] END max_depth=None, n_estimators=200;, score=-2154.586 total time= 3.2min
[CV 1/3; 3/6] START max_depth=10, n_estimators=100..............................
[CV 1/3; 3/6] END max_depth=10, n_estimators=100;, score=-5267.360 total time= 1.1min
[CV 2/3; 3/6] START max_depth=10, n_estimators=100..............................
[CV 2/3; 3/6] END max_depth=10, n_estimators=100;, score=-5422.035 total time= 1.1min
[CV 3/3; 3/6] START max_depth=10, n_estimators=100..............................
[CV 3/3; 3/6] END max_depth=10, n_estimators=100;, score=-5547.786 total time= 1.1min
[CV 1/3; 4/6] START max_depth=10, n_estimators=200..............................
[CV 1/3; 4/6] END max_depth=10, n_estimators=200;, score=-5129.712 total time= 2.3min
[CV 2/3; 4/6] START max_depth=10, n_estimators=200..............................
[CV 2/3; 4/6] END max_depth=10, n_estimators=200;, score=-5256.372 total time= 2.0min
[CV 3/3; 4/6] START max_depth=10, n_estimators=200..............................
[CV 3/3; 4/6] END max_depth=10, n_estimators=200;, score=-5425.331 total time= 2.1min
[CV 1/3; 5/6] START max_depth=20, n_estimators=100..............................
[CV 1/3; 5/6] END max_depth=20, n_estimators=100;, score=-2087.715 total time= 1.7min
[CV 2/3; 5/6] START max_depth=20, n_estimators=100..............................
[CV 2/3; 5/6] END max_depth=20, n_estimators=100;, score=-2144.319 total time= 1.7min
[CV 3/3; 5/6] START max_depth=20, n_estimators=100..............................
[CV 3/3; 5/6] END max_depth=20, n_estimators=100;, score=-2172.198 total time= 1.7min
[CV 1/3; 6/6] START max_depth=20, n_estimators=200..............................
[CV 1/3; 6/6] END max_depth=20, n_estimators=200;, score=-2069.367 total time= 3.3min
[CV 2/3; 6/6] START max_depth=20, n_estimators=200..............................
[CV 2/3; 6/6] END max_depth=20, n_estimators=200;, score=-2131.047 total time= 3.3min
[CV 3/3; 6/6] START max_depth=20, n_estimators=200..............................
[CV 3/3; 6/6] END max_depth=20, n_estimators=200;, score=-2163.186 total time= 3.3min

# Prediction with multiple quantiles
quantiles = [0.05, 0.5, 0.95]  # 5th, 50th, 95th percentiles
predictions = best_qrf.predict(X_train_transformed_small, quantiles=quantiles)

print(predictions[:10])

[[1268545.   1273353.   1273353.  ]
 [ 716555.    718299.    719171.  ]
 [ 632647.    634854.    634854.  ]
 [ 676359.    676359.    676359.  ]
 [ 674081.    674081.    674081.  ]
 [ 696795.    696795.    696795.  ]
 [ 303721.    309510.    311631.55]
 [ 664732.    666779.    672411.15]
 [ 509330.    509330.    509330.  ]
 [1459969.   1460076.   1460076.  ]]

def plot_predictions_on_king_county(model, X_data, y_actual, original_data, 
                                   dataset_type='test', quantiles=[0.05, 0.95],
                                   figsize=(15, 12), point_size=15, alpha=0.7):
    """
    Plot house price predictions on King County boundary map with color-coded accuracy.
    
    """
    
    print("Downloading King County boundary from OpenStreetMap...")
    
    # Create figure
    fig, ax = plt.subplots(figsize=figsize)
    
    king_county = ox.geocode_to_gdf("King County, Washington, USA")
    # Plot boundary
    king_county.plot(ax=ax, color='lightgray', edgecolor='black', alpha=0.3, linewidth=2)
    
    print(f"Making predictions for {len(X_data)} properties...")
    
    # Make predictions
    predictions = model.predict(X_data, quantiles=quantiles)
    lower_bound = predictions[:, 0]
    upper_bound = predictions[:, 1]
    median_pred = np.ravel(model.predict(X_data, quantiles=[0.5]))
    
    # Determine accuracy categories
    within_interval = (y_actual >= lower_bound) & (y_actual <= upper_bound)
    
    # Create color categories
    colors = []
    categories = []
    
    within_interval = np.array(within_interval)
    
    for i in range(len(y_actual)):
        actual = y_actual.iloc[i] if isinstance(y_actual, pd.Series) else y_actual[i]
        
        if within_interval[i]:
            # Check how well-centered the prediction is
            interval_width = upper_bound[i] - lower_bound[i]
            if interval_width > 0:
                distance_from_center = abs(actual - median_pred[i])
                relative_position = distance_from_center / (interval_width / 2)
                
                if relative_position <= 0.3:  # Very close to center
                    colors.append('green')
                    categories.append('Excellent')
                else:  # Within interval but not perfectly centered
                    colors.append('gold')
                    categories.append('Good')
            else:
                colors.append('green')
                categories.append('Excellent')
        else:
            colors.append('red')
            categories.append('Poor')
    
    # Get coordinates
    if len(original_data) >= len(X_data):
        lats = original_data['latitude'].iloc[:len(X_data)].values
        lons = original_data['longitude'].iloc[:len(X_data)].values
    else:
        lats = original_data['latitude'].values
        lons = original_data['longitude'].values
    
    # Plot points by category
    for category, color in [('Excellent', 'green'), ('Good', 'gold'), ('Poor', 'red')]:
        mask = [c == category for c in categories]
        if sum(mask) > 0:
            mask_lats = [lats[i] for i, m in enumerate(mask) if m]
            mask_lons = [lons[i] for i, m in enumerate(mask) if m]
            ax.scatter(mask_lons, mask_lats, c=color, s=point_size, alpha=alpha,
                      label=f'{category} ({sum(mask)})',
                      edgecolors='black', linewidth=0.3, zorder=5)
    
    # Customize plot
    ax.set_xlabel('Longitude', fontsize=12, fontweight='bold')
    ax.set_ylabel('Latitude', fontsize=12, fontweight='bold')
    ax.set_title(f'King County House Price Predictions - {dataset_type.title()} Set\n'
                f'{round((quantiles[1] - quantiles[0]) * 100, 2)}% Confidence Intervals',
                fontsize=14, fontweight='bold', pad=20)
    
    # Add legend
    ax.legend(loc='upper right', frameon=True, fancybox=True, shadow=True)
    ax.grid(True, alpha=0.3, zorder=1)
    ax.set_aspect('equal', adjustable='box')
    
    # Calculate statistics
    total = len(categories)
    excellent_count = categories.count('Excellent')
    good_count = categories.count('Good')
    poor_count = categories.count('Poor')
    within_pct = (sum(within_interval) / total) * 100
    
    # Add statistics text box
    stats_text = f"""
    Accuracy Summary:
    
    Excellent: {excellent_count} ({excellent_count/total*100:.1f}%)
    Good: {good_count} ({good_count/total*100:.1f}%)
    Poor: {poor_count} ({poor_count/total*100:.1f}%)

    Within Interval: {within_pct:.1f}%
    Total: {total:,} properties
    
    """
    
    ax.text(0.007, 0.24, stats_text, transform=ax.transAxes, fontsize=10,
            verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.9))
    
    # Set map bounds
    if len(lats) > 0 and len(lons) > 0:
        lat_margin = (max(lats) - min(lats)) * 0.05
        lon_margin = (max(lons) - min(lons)) * 0.05
        ax.set_xlim(min(lons) - lon_margin, max(lons) + lon_margin)
        ax.set_ylim(min(lats) - lat_margin, max(lats) + lat_margin)
    
    plt.tight_layout()
    
    # Create statistics dictionary
    stats = {
        'total': total,
        'excellent': excellent_count,
        'good': good_count,
        'poor': poor_count,
        'within_interval_pct': within_pct
    }
    
    # Print summary
    print(f"\n=== {dataset_type.title()} Set Results ===")
    print(f"🟢 Excellent: {excellent_count} ({excellent_count/total*100:.1f}%)")
    print(f"🟡 Good: {good_count} ({good_count/total*100:.1f}%)")
    print(f"🔴 Poor: {poor_count} ({poor_count/total*100:.1f}%)")
    print(f"Within {round((quantiles[1] - quantiles[0]) * 100, 2)}% Interval: {within_pct:.1f}%")
    return fig, ax, stats

fig, ax, stats = plot_predictions_on_king_county(
    model=best_qrf,                    # Your trained model
    X_data=X_test_transformed_small,   # Your preprocessed features
    y_actual=y_test,                   # Your actual prices
    original_data=train_data,          # Original data with lat/lon
    dataset_type='test',
    quantiles=[0.05, 0.95],           # 90% confidence interval
    figsize=(16, 14),
    point_size=25,
    alpha=0.8
)
plt.show()

Downloading King County boundary from OpenStreetMap...
Making predictions for 20858 properties...

=== Test Set Results ===
🟢 Excellent: 16762 (80.4%)
🟡 Good: 3439 (16.5%)
🔴 Poor: 657 (3.1%)
Within 90.0% Interval: 96.9%

Column Name	Description
id	record identifier
sale_date	Close-of-escrow date for the recorded sale (YYYY-MM-DD).
sale_price	Final purchase price for the transaction, expressed in US dollars.
sale_nbr	Coded reason for the sale (e.g., 1 = full-value arms-length transfer, 4 = partial interest, 7 = quit-claim, etc.).
sale_warning	Quality flag raised by the assessor when the price appears non-market (auction, related-party transfer, eminent domain, etc.). Blank = no concern.
join_status	How the record joined to the master parcel table at the time the dataset was assembled (`new`, `nochg`, `rebuilt - before`, etc.).
join_year	Calendar year of the most recent successful join to the master table.
latitude	Geographic centroid latitude of the parcel (WGS 84).
longitude	Geographic centroid longitude of the parcel (WGS 84).
area	Assessorial "area number" used by King County for grouping neighbourhoods that share similar market characteristics.
city	Incorporated city (or "KING COUNTY" for the unincorporated area) in which the parcel lies.
zoning	Current primary zoning designation on the parcel (e.g., `R-8`, `LR3 (M)`, `RA5`).
subdivision	Plat, condominium, or short-plat name recorded with the county auditor.
present_use	Numeric land-use code describing how the property is currently used (e.g., 1 = single-family, 2 = multi-family 2–4 units, 29 = town-house).
land_val	Most recent assessor land valuation (USD).
imp_val	Most recent assessor improvement (buildings & fixtures) valuation (USD).
year_built	Year the principal structure was originally constructed (0 when missing).
year_reno	Year of the last permitted remodel/addition (if any)
sqft_lot	Lot size in square feet taken from the recorded legal description or survey.
sqft	Total finished living-area square footage (above- and below-grade).
sqft_1	Finished square footage above grade on the 1st floor.
sqft_fbsmt	Finished square footage of the basement (0 when no finished basement).
grade	Assessor construction-quality "grade" (1 = low, 13 = mansion-quality; most houses are 6–9).
fbsmt_grade	Construction quality of the finished basement area (same 1–13 scale; 0 when no finished basement).
condition	Assessor physical condition code (1 = poor, 9 = excellent; 0 = not rated).
stories	Number of full stories counted above grade (split-levels show as 1).
beds	Legal bedroom count.
bath_full	Number of full bathrooms (sink + toilet + tub/shower).
bath_3qtr	Number of ¾ bathrooms (sink + toilet + shower only).
bath_half	Number of half bathrooms (sink + toilet only).
garb_sqft	Finished area in a basement garage or boat-storage bay (square feet).
gara_sqft	Finished area in an attached or detached garage (square feet).
wfnt	Waterfront indicator (0 = no waterfront access; 1-9 levels of waterfront proximity/quality).
golf	1 = parcel's primary outlook is a golf course; 0 = no golf-course view.
greenbelt	1 = parcel abuts a protected greenbelt, park or open space; 0 = no greenbelt adjacency.
noise_traffic	Noise level assessment from traffic, airports, and rail sources (0-3 scale); 0 = typical noise, 3 = High noise exposure
view_rainier	View quality of Mt. Rainier (0 = no view, 1-4 = increasing view quality)
view_olympics	View quality of the Olympic Mountains (0-4 scale)
view_cascades	View quality of the Cascade Mountains (0-4 scale)
view_territorial	Quality of broad territorial (land) view (0-4 scale)
view_skyline	View quality of city skyline (Seattle/Bellevue) (0-4 scale)
view_sound	View quality of Puget Sound (0-4 scale)
view_lakewash	View quality of Lake Washington (0-4 scale)
view_lakesamm	View quality of Lake Sammamish (0-4 scale)
view_otherwater	View quality of other water bodies (0-4 scale)
view_other	Quality of other premium views (0-4 scale)
submarket	Letter code grouping neighbourhoods into broader sub-markets used by local appraisers (A = prime lake-front, B = in-city view areas, …, N = rural east county, etc.).

	id	sale_price	sale_nbr	join_year	latitude	longitude	area	present_use	land_val	imp_val	year_built	year_reno	sqft_lot	sqft	sqft_1	sqft_fbsmt	grade	fbsmt_grade	condition	stories	beds	bath_full	bath_3qtr	bath_half	garb_sqft	gara_sqft	wfnt	golf	greenbelt	noise_traffic	view_rainier	view_olympics	view_cascades	view_territorial	view_skyline	view_sound	view_lakewash	view_lakesamm	view_otherwater	view_other
count	200000.000000	2.000000e+05	157818.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	2.000000e+05	2.000000e+05	200000.000000	200000.000000	2.000000e+05	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.00000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000
mean	99999.500000	5.841495e+05	2.162599	2023.403600	47.549248	-122.210416	48.644215	4.108860	4.601691e+05	4.917715e+05	1974.184760	59.468830	1.378310e+04	2120.679850	1251.284280	293.238535	7.667290	2.811045	3.515745	1.523778	3.419390	1.579735	0.494115	0.493020	80.32632	274.151470	0.078620	0.006220	0.033505	0.198130	0.017940	0.053985	0.058800	0.215550	0.018425	0.055565	0.050075	0.014090	0.020875	0.013455
std	57735.171256	4.170595e+05	1.113090	6.241643	0.142710	0.140339	27.132002	7.199323	3.510444e+05	3.680505e+05	30.544426	339.334129	3.793152e+04	909.799433	468.094648	443.577947	1.153746	3.556495	0.704148	0.526367	0.897639	0.672685	0.638183	0.525635	180.13173	288.338763	0.757477	0.078622	0.179952	0.548412	0.218994	0.379119	0.381868	0.724224	0.222746	0.380011	0.353664	0.200154	0.248977	0.181147
min	0.000000	5.029300e+04	1.000000	1999.000000	47.155200	-122.527700	1.000000	2.000000	0.000000e+00	0.000000e+00	1900.000000	0.000000	3.750000e+02	0.000000	1.000000	0.000000	1.000000	0.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	49999.750000	3.050000e+05	1.000000	2025.000000	47.446500	-122.323800	26.000000	2.000000	2.310000e+05	2.800000e+05	1953.000000	0.000000	5.000000e+03	1460.000000	950.000000	0.000000	7.000000	0.000000	3.000000	1.000000	3.000000	1.000000	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	99999.500000	4.599500e+05	2.000000	2025.000000	47.562800	-122.222700	48.000000	2.000000	3.770000e+05	4.090000e+05	1978.000000	0.000000	7.438000e+03	1970.000000	1200.000000	0.000000	7.000000	0.000000	3.000000	1.500000	3.000000	2.000000	0.000000	0.000000	0.00000	240.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
75%	149999.250000	7.249500e+05	3.000000	2025.000000	47.673500	-122.121700	71.000000	2.000000	5.940000e+05	5.990000e+05	2001.000000	0.000000	1.022000e+04	2610.000000	1470.000000	570.000000	8.000000	7.000000	4.000000	2.000000	4.000000	2.000000	1.000000	1.000000	0.00000	480.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
max	199999.000000	2.999950e+06	11.000000	2025.000000	47.777800	-121.161300	100.000000	29.000000	1.386400e+07	1.006700e+07	2025.000000	2024.000000	2.310573e+06	13540.000000	7760.000000	5480.000000	13.000000	13.000000	5.000000	4.500000	14.000000	9.000000	8.000000	12.000000	12740.00000	4404.000000	9.000000	1.000000	1.000000	3.000000	4.000000	4.000000	4.000000	4.000000	4.000000	4.000000	4.000000	4.000000	4.000000	4.000000

	sale_date	sale_price
sale_date
1999	1999-07-15	228950.0
2000	2000-06-15	249950.0
2001	2001-07-15	259950.0
2002	2002-06-15	278000.0
2003	2003-07-15	289980.0
2004	2004-07-15	323434.0
2005	2005-07-15	366874.0
2006	2006-06-15	412000.0
2007	2007-06-15	450000.0
2008	2008-06-15	428000.0
2009	2009-07-15	395000.0
2010	2010-06-15	390080.0
2011	2011-07-15	385000.0
2012	2012-07-15	395000.0
2013	2013-07-15	425000.0
2014	2014-07-15	454100.0
2015	2015-07-15	483000.0
2016	2016-07-15	535000.0
2017	2017-07-15	610000.0
2018	2018-06-15	650000.0
2019	2019-07-15	655000.0
2020	2020-07-15	705000.0
2021	2021-07-15	810000.0
2022	2022-06-15	889000.0
2023	2023-06-15	860000.0
2024	2024-07-15	925000.0
2025	2025-01-15	849950.0

Project Overview¶

Project Goals¶

Methodology¶

About the Data¶

Data Dictionary¶

Data Challenges¶

1. Setup and Configuration¶

Utility Functions¶

2. Data Loading and Initial Exploration¶

3. Data Preprocessing and Cleaning¶

Admin Data¶

Drop records with join-year of 1999 as they likely contain outdated assessment information¶

Sale Data¶

Use sale_nbr codes to filter for only Good Sale (i.e., fair market): 00, 01, 06, and 11.¶

Geo Data¶

Legal Data¶

Assesor Features¶

Data in the file is from the most recent assessment and not necessarily refective of the property at sale_date.¶

Property Characteristic Features¶

Trim dataset down by removing features as indicated by above analysis.¶

4. Feature Engineering¶

Spatial Feature Engineering¶

Temporal Feature Engineering¶

Property Characteristic Enhancement¶

Market Context Features¶

5. Model Development¶

Quantile Regression Forest¶

5.1 Model Evaluation¶

6. Conclusion and Additional Observations¶

	id	sale_date	sale_price	sale_nbr	sale_warning	join_status	join_year	latitude	longitude	area	city	zoning	subdivision	present_use	land_val	imp_val	year_built	sqft_lot	sqft	sqft_1	sqft_fbsmt	grade	fbsmt_grade	condition	stories	beds	bath_full	bath_3qtr	bath_half	gara_sqft	noise_traffic	view_lakewash	submarket
0	0	2014-11-15	236000	2.0		nochg	2025	47.2917	-122.3658	53	FEDERAL WAY	RS7.2	ALDERWOOD SOUTH DIV NO. 02	2	167000	372000	1975	10919	1560	1560	0	7	0	5	1.0	3	1	1	0	500	0	0	I
1	1	1999-01-15	313300	NaN	26	nochg	2025	47.6531	-122.1996	74	KIRKLAND	RS 8.5	WILDWOOD LANE NO. 03	2	1184000	598000	1962	8900	2040	1220	820	7	7	4	1.0	3	1	1	1	0	0	1	Q
2	2	2006-08-15	341000	1.0		nochg	2025	47.4733	-122.1901	30	RENTON	R-8	FALCON RIDGE (CEDAR RIDGE)	2	230000	356000	1986	4953	1640	820	0	7	0	3	2.0	3	2	0	1	480	0	0	K
3	3	1999-12-15	267000	1.0		nochg	2025	47.4739	-122.3295	96	BURIEN	RS-7200	OLYMPIC VUE ESTATES	2	190000	518000	1998	6799	2610	1010	500	8	7	3	2.0	4	2	0	1	530	1	0	G
4	4	2018-07-15	1650000	2.0		miss99	2025	47.7516	-122.1222	36	KING COUNTY	RA2.5	HOLLYWOOD HILL HIGHLANDS	2	616000	1917000	1998	31687	4040	3640	0	12	0	3	2.0	4	2	1	1	810	0	0	P

	id	sale_date	sale_price	sale_nbr	join_status	join_year	latitude	longitude	area	city	zoning	subdivision	present_use	land_val	imp_val	year_built	sqft_lot	sqft	sqft_1	sqft_fbsmt	grade	fbsmt_grade	condition	stories	beds	bath_full	bath_3qtr	bath_half	gara_sqft	submarket
62015	62015	2006-01-15	463000	1.0	nochg	2025	47.1712	-121.9123	40	KING COUNTY	F	NaN	2	0	0	1997	35996	2370	1500	0	8	0	3	2.0	3	1	1	0	0	M
75626	75626	2023-10-15	720000	3.0	nochg	2025	47.1712	-121.9123	40	KING COUNTY	F	NaN	2	0	0	1997	35996	2370	1500	0	8	0	3	2.0	3	1	1	0	0	M
12624	12624	2015-09-15	317000	1.0	nochg	2025	47.1767	-122.0249	40	KING COUNTY	A35	GLACIER VISTA DIV NO. 03	2	135000	399000	1975	19465	1450	1450	0	7	0	5	1.0	3	1	1	0	500	M
59737	59737	2023-07-15	600000	2.0	nochg	2025	47.1767	-122.0249	40	KING COUNTY	A35	GLACIER VISTA DIV NO. 03	2	135000	399000	1975	19465	1450	1450	0	7	0	5	1.0	3	1	1	0	500	M
62451	62451	2021-08-15	505000	2.0	nochg	2025	47.1772	-122.0262	40	KING COUNTY	A35	GLACIER VISTA DIV NO. 03	2	147000	394000	1974	19465	1750	1750	0	7	0	4	1.0	3	1	1	0	510	M
169708	169708	2004-04-15	245000	1.0	nochg	2025	47.1772	-122.0262	40	KING COUNTY	A35	GLACIER VISTA DIV NO. 03	2	147000	394000	1974	19465	1750	1750	0	7	0	4	1.0	3	1	1	0	510	M
81509	81509	2022-04-15	1510000	4.0	new	2025	47.1774	-122.0112	40	KING COUNTY	RA10	OSCEOLA ADD	2	206000	948000	2006	42148	6970	2040	1730	9	9	3	2.5	4	2	2	1	1140	M
140601	140601	2013-01-15	587500	3.0	new	2025	47.1774	-122.0112	40	KING COUNTY	RA10	OSCEOLA ADD	2	206000	948000	2006	42148	6970	2040	1730	9	9	3	2.5	4	2	2	1	1140	M
3338	3338	2024-11-15	1349000	4.0	nochg	2025	47.1794	-121.9727	40	KING COUNTY	A35	NaN	2	0	0	1989	272599	4500	2210	0	9	0	4	2.0	4	2	2	1	980	M
43973	43973	2002-11-15	465600	1.0	nochg	2025	47.1794	-121.9727	40	KING COUNTY	A35	NaN	2	0	0	1989	272599	4500	2210	0	9	0	4	2.0	4	2	2	1	980	M

		sale_price			year_reno			condition			grade
		min	median	max	min	median	max	min	median	max	min	median	max
join_status	join_year
demo	1999	52400	410000.0	2975000	0	0.0	1998	1	3.0	5	1	7.0	12
miss99	2025	50500	450000.0	2995000	0	0.0	2023	1	3.0	5	4	8.0	13
new	2025	50667	555000.0	2998000	0	0.0	2024	2	3.0	5	6	8.0	13
nochg	2025	50293	415000.0	2999500	0	0.0	0	1	4.0	5	1	7.0	13
rebuilt - after	2025	50462	405000.0	2925000	0	0.0	2023	3	3.0	5	5	9.0	13
rebuilt - before	1999	65000	975000.0	2999950	0	0.0	1998	1	3.0	5	1	7.0	12
reno - after	1999	50300	385000.0	2770000	0	0.0	1999	1	3.0	5	3	7.0	13
reno - before	1999	54000	550000.0	2998000	0	1984.0	1999	1	3.0	5	3	7.0	13
reno - before	2025	70000	837500.0	2998000	1999	2006.0	2023	2	3.0	5	5	8.0	13

	Column	dtype	min	max	Total Outliers (%)	Lower Outliers (%)	Upper Outliers (%)
4	garb_sqft	int64	0.00	4000.00	22.72%	0.00%	22.72%
5	sqft_lot	int64	381.00	2076940.00	10.78%	0.00%	10.78%
1	sqft_1	int64	80.00	7760.00	3.05%	0.05%	3.00%
0	sqft	int64	200.00	13310.00	2.03%	0.00%	2.03%
2	sqft_fbsmt	int64	0.00	5110.00	1.88%	0.00%	1.88%
3	gara_sqft	int64	0.00	4404.00	0.23%	0.00%	0.23%

	id	sale_date	join_year	latitude	longitude	area	present_use	land_val	imp_val	year_built	year_reno	sqft_lot	sqft	sqft_1	sqft_fbsmt	grade	fbsmt_grade	condition	stories	beds	bath_full	bath_3qtr	bath_half	garb_sqft	gara_sqft	wfnt	golf	greenbelt	noise_traffic	view_rainier	view_olympics	view_cascades	view_territorial	view_skyline	view_sound	view_lakewash	view_lakesamm	view_otherwater	view_other	adjusted_sale_price	sale_warning_1	sale_warning_2	sale_warning_3	sale_warning_4	sale_warning_5	sale_warning_6	sale_warning_7	sale_warning_8	sale_warning_9	sale_warning_10	sale_warning_11	sale_warning_12	sale_warning_13	sale_warning_14	sale_warning_15	sale_warning_16	sale_warning_17	sale_warning_18	sale_warning_19	sale_warning_20	sale_warning_21	sale_warning_22	sale_warning_23	sale_warning_24	sale_warning_25	sale_warning_26	sale_warning_27	sale_warning_28	sale_warning_29	sale_warning_30	sale_warning_31	sale_warning_32	sale_warning_33	sale_warning_34	sale_warning_35	sale_warning_36	sale_warning_37	sale_warning_38	sale_warning_39	sale_warning_40	sale_warning_41	sale_warning_42	sale_warning_43	sale_warning_44	sale_warning_45	sale_warning_46	sale_warning_47	sale_warning_48	sale_warning_49	sale_warning_50	sale_warning_51	sale_warning_52	sale_warning_53	sale_warning_54	sale_warning_55	sale_warning_56	sale_warning_57	sale_warning_58	sale_warning_59	sale_warning_60	sale_warning_61	sale_warning_62
count	583.000000	583	583.0	583.000000	583.000000	583.000000	583.000000	5.830000e+02	5.830000e+02	583.000000	583.0	5.830000e+02	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	583.000000	5.830000e+02	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.000000	583.0	583.0	583.0	583.0	583.000000	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.000000	583.0	583.0	583.000000	583.0	583.0	583.0	583.0	583.0	583.000000	583.0	583.0	583.0	583.0	583.000000	583.000000	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.0	583.000000	583.0	583.0	583.0	583.0	583.0	583.0
mean	99570.855918	2009-03-01 20:25:06.689536768	2025.0	47.540101	-122.085634	62.660377	2.053173	6.148419e+05	1.107655e+06	1988.849057	0.0	7.223184e+04	3750.723842	2223.835334	269.939966	9.480274	1.651801	3.447684	1.728988	3.874786	2.094340	0.670669	0.871355	12.349914	1230.087479	0.324185	0.025729	0.041166	0.120069	0.051458	0.066895	0.111492	0.404803	0.017153	0.039451	0.058319	0.060034	0.085763	0.039451	1.550213e+06	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.001715	0.0	0.0	0.0	0.0	0.017153	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.044597	0.0	0.0	0.001715	0.0	0.0	0.0	0.0	0.0	0.001715	0.0	0.0	0.0	0.0	0.012007	0.003431	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.001715	0.0	0.0	0.0	0.0	0.0	0.0
min	35.000000	1999-01-15 00:00:00	2025.0	47.201600	-122.496400	1.000000	2.000000	0.000000e+00	0.000000e+00	1921.000000	0.0	5.189000e+03	840.000000	390.000000	0.000000	6.000000	0.000000	3.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000	1010.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.358280e+05	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0
25%	49484.000000	2002-06-30 00:00:00	2025.0	47.395800	-122.148750	47.000000	2.000000	3.400000e+05	6.135000e+05	1985.000000	0.0	1.940800e+04	2845.000000	1588.000000	0.000000	8.000000	0.000000	3.000000	1.000000	3.000000	2.000000	0.000000	1.000000	0.000000	1055.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	9.235880e+05	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0
50%	97186.000000	2006-09-15 00:00:00	2025.0	47.566400	-122.076000	66.000000	2.000000	5.250000e+05	9.870000e+05	1990.000000	0.0	3.560800e+04	3590.000000	2010.000000	0.000000	10.000000	0.000000	3.000000	2.000000	4.000000	2.000000	1.000000	1.000000	0.000000	1120.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.320076e+06	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0
75%	149202.000000	2016-02-14 00:00:00	2025.0	47.669150	-122.025950	72.000000	2.000000	6.880000e+05	1.439000e+06	1997.000000	0.0	7.838800e+04	4410.000000	2620.000000	0.000000	11.000000	0.000000	4.000000	2.000000	4.000000	2.000000	1.000000	1.000000	0.000000	1270.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.945776e+06	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0
max	199962.000000	2024-07-15 00:00:00	2025.0	47.776400	-121.705100	100.000000	29.000000	5.591000e+06	6.653000e+06	2022.000000	0.0	1.092431e+06	13310.000000	7760.000000	3730.000000	13.000000	11.000000	5.000000	2.500000	7.000000	6.000000	4.000000	3.000000	1200.000000	4404.000000	9.000000	1.000000	1.000000	3.000000	4.000000	3.000000	4.000000	4.000000	2.000000	3.000000	4.000000	4.000000	4.000000	4.000000	5.973216e+06	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.000000	0.0	0.0	0.0	0.0	1.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.000000	0.0	0.0	1.000000	0.0	0.0	0.0	0.0	0.0	1.000000	0.0	0.0	0.0	0.0	1.000000	1.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.000000	0.0	0.0	0.0	0.0	0.0	0.0
std	58692.390383	NaN	0.0	0.155294	0.114369	21.320205	1.130149	5.074465e+05	6.969236e+05	14.305859	0.0	1.045035e+05	1432.813786	885.067103	644.425841	1.449922	3.485935	0.592267	0.443897	0.818451	0.824675	0.767430	0.596887	102.272131	339.842076	1.573912	0.158462	0.198845	0.425952	0.398572	0.406965	0.536876	0.996960	0.184579	0.315936	0.366129	0.428573	0.520560	0.326632	9.133677e+05	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.041416	0.0	0.0	0.0	0.0	0.129952	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.206594	0.0	0.0	0.041416	0.0	0.0	0.0	0.0	0.0	0.041416	0.0	0.0	0.0	0.0	0.109010	0.058520	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.041416	0.0	0.0	0.0	0.0	0.0	0.0

	id	sale_date	sale_nbr	sale_warning	join_status	join_year	latitude	longitude	area	city	zoning	subdivision	present_use	land_val	imp_val	year_built	sqft_lot	sqft	sqft_1	sqft_fbsmt	grade	fbsmt_grade	condition	stories	beds	bath_full	bath_3qtr	bath_half	gara_sqft	golf	noise_traffic	view_territorial	submarket	adjusted_sale_price	sale_warning_10	sale_warning_26	zoning_category
75850	75850	2004-07-15	partial_split		new	2025	47.6003	-122.0071	35	SAMMAMISH	R4	BEAVERDAM DIV NO. 01	2	1261000	3388000	1999	77101	10380	5270	5110	13	12	3	1.0	6	2	3	1	790	1	0	0	O	6083831	0	0	Residential Zones
89377	89377	2004-09-15	standard	26	nochg	2025	47.6271	-122.3149	13	SEATTLE	NR3	CAPITOL HILL	2	1744000	4223000	1914	13744	10950	3770	800	13	8	5	2.5	13	9	3	1	0	0	1	0	D	3871529	0	1	Neighborhood Residential
100476	100476	2003-03-15	partial_split		nochg	2025	47.4398	-122.0240	66	KING COUNTY	RA5	WEBSTER LAKE ESTATES	2	507000	2649000	1998	206038	13310	5290	3730	12	10	3	2.0	5	2	4	1	1570	0	0	0	K	5392654	0	0	Residential Zones
104430	104430	1999-03-15	standard		nochg	2025	47.4419	-122.0136	66	KING COUNTY	RA5	NaN	2	695000	2101000	1998	102366	10150	4950	2490	12	10	3	2.0	4	4	1	2	1370	0	0	3	K	4451419	0	0	Residential Zones
128359	128359	2000-03-15	standard	10	new	2025	47.5945	-122.2066	92	BELLEVUE	R-1.8	NaN	2	3204000	6653000	2001	65775	11400	7760	320	13	9	3	2.0	5	5	0	2	1290	0	0	0	S	1776733	1	0	Industrial and Other

	id	sale_date	sale_nbr	sale_warning	join_status	join_year	latitude	longitude	area	city	zoning	subdivision	present_use	land_val	imp_val	year_built	sqft_lot	sqft	sqft_1	grade	condition	stories	beds	bath_full	bath_3qtr	bath_half	gara_sqft	noise_traffic	view_rainier	view_cascades	view_territorial	submarket	adjusted_sale_price	sale_warning_26	sale_warning_60	zoning_category
3794	3794	2004-06-15	standard		nochg	2025	47.5750	-122.1424	31	BELLEVUE	R-5	EASTGATE ADD DIV A	2	900000	35000	1954	10400	1050	1160	7	4	1.0	3	1	1	0	0	1	0	0	0	R	525421	0	0	Residential Zones
4066	4066	2018-06-15	standard		nochg	2025	47.2486	-122.0010	40	KING COUNTY	RA2.5	NaN	2	301000	635000	1988	98881	2360	2660	8	3	1.0	2	2	0	1	790	0	3	2	2	M	912977	0	0	Residential Zones
7382	7382	2004-06-15	partial_split		nochg	2025	47.4862	-122.3303	96	BURIEN	RS-7200	CEDARHURST DIV NO. 01	2	201000	184000	1963	6600	620	850	6	3	1.0	2	1	0	0	0	0	0	0	0	G	285387	0	0	Residential Zones
10650	10650	2015-06-15	partial_split		nochg	2025	47.3729	-122.2780	26	KENT	SR-6	KENTWOOD GLEN NO. 02	2	151000	383000	1967	8480	1650	1940	7	4	1.0	3	2	0	0	0	0	0	0	0	I	357102	0	0	Special Use Zones
12230	12230	2017-10-15	standard	26	nochg	2025	47.5076	-122.1415	66	KING COUNTY	RA5	MAY VALLEY DIV NO. 02	2	253000	394000	1965	16400	1300	1640	7	4	1.0	3	1	0	1	0	0	0	0	0	K	525427	1	0	Residential Zones
21623	21623	2010-05-15	standard	60	nochg	2025	47.6784	-122.1602	93	REDMOND	NR	BURKE-FARRARS KIRKLAND DIV NO. 12	2	775000	1000	1962	16900	920	1300	6	4	1.0	2	1	0	0	0	3	0	0	0	Q	501974	0	1	Neighborhood Residential
22436	22436	2004-05-15	standard		nochg	2025	47.6917	-122.2015	74	KIRKLAND	RS 7.2	BURKE-FARRARS KIRKLAND DIV NO. 27	2	1058000	241000	1969	7200	1250	1400	7	3	1.0	3	1	1	0	390	0	0	0	0	Q	772093	0	0	Industrial and Other
26605	26605	2003-12-15	partial_split		nochg	2025	47.6960	-122.1139	72	REDMOND	NR	MESA VERDE DIV NO. 01	2	664000	98000	1969	7360	1060	1130	7	3	1.0	3	1	0	0	0	0	0	0	0	P	528030	0	0	Neighborhood Residential
38742	38742	2009-10-15	standard		nochg	2025	47.4735	-122.3438	96	BURIEN	RS-7200	LEONARD ADD	2	226000	286000	1953	8360	1420	1500	7	3	1.0	3	1	0	0	0	0	0	0	0	G	455976	0	0	Residential Zones
39678	39678	2013-06-15	partial_split		nochg	2025	47.7516	-122.3567	1	SHORELINE	R6	BALCHS ALBERT PARK HIGHLANDS ADD	2	386000	288000	1955	8100	1300	1590	7	3	1.0	3	1	0	0	0	0	0	0	0	A	488164	0	0	Residential Zones

	id	sale_date	adjusted_sale_price	sale_nbr	sale_warning_26	latitude	longitude	area	city	zoning_category	year_built	sqft	sqft_1	stories	beds	bath_full	bath_3qtr	bath_half	submarket	finished_basement_type	attached_garage_type	basement_garage_type	above_typical_noise	has_view_lakewash	log_adj_sale_price	log_sqft	log_sqft_1
1	1	1999-01-15	776952	standard	1	47.6531	-122.1996	74	KIRKLAND	Industrial and Other	1962	2040	1220	1.0	3	1	1	1	Q	Medium	None	None	0	1	13.563134	7.620705	7.106606
2	2	2006-08-15	697511	partial_split	0	47.4733	-122.1901	30	RENTON	Residential Zones	1986	1640	820	2.0	3	2	0	1	K	None	Medium	None	0	0	13.455274	7.402452	6.709304
3	3	1999-12-15	662133	partial_split	0	47.4739	-122.3295	96	BURIEN	Residential Zones	1998	2610	1010	2.0	4	2	0	1	G	Small	Medium	None	1	0	13.403222	7.867106	6.917706
7	7	2001-08-15	527497	partial_split	0	47.3090	-122.3490	54	FEDERAL WAY	Industrial and Other	1985	2040	1120	2.0	3	2	0	1	I	None	Medium	None	0	0	13.175898	7.620705	7.021084
8	8	2002-01-15	534295	partial_split	0	47.4955	-122.3565	96	BURIEN	Residential Zones	1962	2180	1090	1.0	4	1	1	1	G	Medium	None	None	0	0	13.188703	7.687080	6.993933