import numpy as np
import csv

# Cell values (after trimming whitespace and lower-casing) that count as missing.
MISSING_MARKERS = frozenset({'', 'na', 'nan'})


def _read_csv(path):
    """Return ``(header, rows)`` read from the CSV file at *path*.

    Raises:
        ValueError: if the file contains no header row at all.
    """
    # newline='' hands line-ending handling to the csv module, which is
    # required for newlines embedded in quoted fields to be parsed correctly.
    with open(path, 'r', newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            raise ValueError(f"{path} is empty: no header row found")
        return header, list(reader)


def _is_complete(row, n_columns):
    """Return True when *row* has a usable value for each of *n_columns* columns."""
    # csv.reader yields [] for blank lines, and all() over an empty row is
    # vacuously True, so empty and short rows must be rejected explicitly.
    if not row or len(row) < n_columns:
        return False
    return all(cell.strip().lower() not in MISSING_MARKERS for cell in row)


def _shuffle_split(rows, train_fraction, seed):
    """Shuffle *rows* reproducibly and return ``(train_rows, test_rows)``."""
    # The legacy global seed + permutation is kept on purpose: it reproduces
    # the exact row order earlier runs of this script generated.
    np.random.seed(seed)
    order = np.random.permutation(len(rows))
    cut = int(train_fraction * len(rows))
    train_rows = [rows[i] for i in order[:cut]]
    test_rows = [rows[i] for i in order[cut:]]
    return train_rows, test_rows


def _write_csv(path, header, rows):
    """Write *header* followed by *rows* to the CSV file at *path*."""
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)


def main(input_path='housing.csv', train_path='housing_train.csv',
         test_path='housing_test.csv', train_fraction=0.8, seed=42):
    """Clean the housing CSV and write a shuffled train/test split.

    Rows are dropped when they are blank, have fewer fields than the header,
    or contain a cell that is empty, 'NA' or 'nan' (case-insensitive,
    surrounding whitespace ignored).

    Args:
        input_path: CSV file to read; its first row is the header.
        train_path: destination for the training rows.
        test_path: destination for the remaining rows.
        train_fraction: share of the cleaned rows that goes to the train set.
        seed: seed for the shuffle, so the split is reproducible.

    Returns:
        A ``(train_rows, test_rows)`` tuple of the rows that were written.

    Raises:
        ValueError: if *input_path* has no header row.
    """
    print(f"Reading {input_path}...")
    header, data = _read_csv(input_path)
    n_columns = len(header)
    print(f"Original data shape: ({len(data)}, {n_columns})")

    # Remove rows with missing data
    data_clean = [row for row in data if _is_complete(row, n_columns)]
    print(f"\nData shape after removing missing values: ({len(data_clean)}, {n_columns})")
    print(f"Rows removed: {len(data) - len(data_clean)}")

    # Shuffle and split into train and test sets
    train_data, test_data = _shuffle_split(data_clean, train_fraction, seed)

    # Save to CSV files
    _write_csv(train_path, header, train_data)
    _write_csv(test_path, header, test_data)

    print(f"\nTrain set shape: ({len(train_data)}, {n_columns})")
    print(f"Test set shape: ({len(test_data)}, {n_columns})")
    print("\nFiles saved:")
    print(f"- {train_path}")
    print(f"- {test_path}")
    return train_data, test_data


if __name__ == "__main__":
    main()
