import numpy as np, pandas as pd, cv2, random, matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from sklearn.model_selection import train_test_split

image_size = 30  # Size of the generated images (30x30 pixels)
min_radius = 3  # Minimum circle radius in pixels
max_radius = image_size // 2 - 2  # Maximum circle radius, ensuring circles fit within the image
total_size = 2200  # Total number of images to generate
noise_level = 0.8  # Noise level to add to the images

csv_filename = 'generated_images_masks.csv'  # Filename for saving the generated data as a CSV

def generate_image_with_metadata(image_size, radius, x, y, noise_level=0.8):
  """
  Generate a binary mask with a circle, add random noise, attach metadata (radius, x, y), and return the flattened mask.

  noise_level: Controls the amount of random noise added (default is 0.8).
  """
  # Create a black background image
  img = Image.new('1', (image_size, image_size), color=0)
  draw = ImageDraw.Draw(img)

  # Draw the circle on the binary mask
  draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=1)

  # Add random noise to the image
  noise = np.random.rand(image_size, image_size) > noise_level  # Generate random noise
  noisy_mask = np.logical_and(img, noise).astype(np.float32)  # Apply noise to the mask

  # Flatten the noisy mask into a 1D array
  flattened_mask = noisy_mask.flatten()

  # Store the metadata (radius, x, y)
  metadata = [radius, x, y]

  # Combine metadata with the binary mask
  return np.array(metadata + flattened_mask.tolist())


def generate_images_with_masks(total_size, image_size, min_radius, max_radius):
  """
  Generate multiple images with masks by calling the image generation function.
  """
  data = []  # List to store masks and metadata

  for _ in range(total_size):
    # Randomly choose the radius and center (x, y)
    radius = random.randint(min_radius, max_radius)
    x = random.randint(radius + 1, image_size - radius - 1)
    y = random.randint(radius + 1, image_size - radius - 1)

    # Generate a single image with mask and metadata
    single_image_data = generate_image_with_metadata(image_size, radius, x, y, noise_level)
    data.append(single_image_data)

  return np.array(data)

# Generate images with masks (returns a dataset including radius, x, y, and pixel values)
data = generate_images_with_masks(total_size, image_size, min_radius, max_radius)

# Define column names for the DataFrame: radius, x, y, and pixel values
columns = ['radius', 'x', 'y'] + [f'Pixel_{i}' for i in range(image_size * image_size)]

# Create a DataFrame from the generated data with the specified columns
df = pd.DataFrame(data, columns=columns)

# Save the DataFrame to a CSV file without including the index
df.to_csv(csv_filename, index=False)

# Notify the user that the data has been saved to the specified CSV file
print(f'Data saved to {csv_filename}')

Data saved to generated_images_masks.csv

# Function to visualize the image
def visualize(image_data, image_size):
  """
  Visualize the image given an array of image data and return the plot object.

  Parameters:
  - image_data: A NumPy array containing image metadata and mask.
  - image_size: The size of the image (width and height, assumed square).

  Returns:
  - fig, ax: The matplotlib figure and axes objects for further modification.
  """
  # Extract metadata from the array (radius and circle center coordinates)
  radius = int(image_data[0])
  x = int(image_data[1])
  y = int(image_data[2])

  # Extract the binary mask from the remaining part of the array
  mask = image_data[3:].astype(np.uint8).reshape(image_size, image_size)

  # Create a grayscale image from the mask
  img = np.zeros((image_size, image_size), dtype=np.uint8)  # Initialize the image with zeros (black)
  img[mask == 1] = 255  # Set the circle area to white (255)

  # Create a figure and axis for plotting
  fig, ax = plt.subplots()
  ax.imshow(img, cmap='gray')  # Display the image in grayscale
  ax.set_title(f'Circle at center ({x}, {y}) with radius {radius}')  # Set title with circle metadata
  ax.axis('off')  # Turn off axis labels and ticks

  # Return the figure and axis for further customization
  return fig, ax

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_filename)

# Prompt the user to enter an image index and extract the corresponding row
image_index = int(input('Enter the image index: '))
image_data = df.iloc[image_index].values  # Get the data for the specified image index

# Visualize the selected image from the DataFrame
_, _ = visualize(image_data, image_size)  # Call the visualize function to display the image

# Function to solve ordinary least squares (OLS)
def solve_ols(X_train, Y_train, X_other, alpha):
  """
  Solves the OLS regression problem with regularization (RLS).

  Parameters:
  - X_train: The training input data (features).
  - Y_train: The training output data (targets).
  - X_other: The input data (features) for the other dataset (e.g., validation or test).
  - alpha: Regularization parameter.

  Returns:
  - Y_LS_train: Predicted outputs for the training data.
  - Y_LS_other: Predicted outputs for the other dataset.
  """
  # Calculate the weights (W) using the pseudo-inverse and regularization
  W = np.dot(
    np.linalg.pinv(np.dot(X_train.T, X_train) + alpha * np.identity(np.shape(X_train)[1])), np.dot(X_train.T, Y_train)
  )

  Y_LS_train = np.dot(X_train, W)
  Y_LS_other = np.dot(X_other, W)
  # Return predictions for both training and other data
  return Y_LS_train, Y_LS_other

# Function to run OLS, compute MSE, and output results and labels
def run_ols(X_train, X_other, Y_train, Y_other, alpha, augment=False):
  """
  Runs the OLS regression, computes the MSE for both training and another dataset (e.g., validation/test), and returns results.

  Parameters:
  - X_train: The training input data (features).
  - X_other: The input data (features) for the other dataset (e.g., validation or test).
  - Y_train: The training output data (targets).
  - Y_other: The output data (targets) for the other dataset (e.g., validation or test).
  - alpha: Regularization parameter (ridge regression).
  - augment: Boolean indicating whether to augment data by adding a bias term.

  Returns:
  - mse_train: Mean squared error on the training set.
  - mse_other: Mean squared error on the other dataset.
  - Y_LS_train: Predicted outputs for the training data.
  - Y_LS_other: Predicted outputs for the other dataset.
  """
  if augment:
    # Augment the data by adding a bias term (column of ones)
    X_train = augment_data(X_train)
    X_other = augment_data(X_other)

  # Perform OLS regression to get predictions
  Y_LS_train, Y_LS_other = solve_ols(X_train, Y_train, X_other, alpha)

  # Compute MSE for train and other datasets
  mse_train = compute_mse(Y_LS_train, Y_train)
  mse_other = compute_mse(Y_LS_other, Y_other)

  # Return the MSE and predictions
  return mse_train, mse_other, Y_LS_train, Y_LS_other

# Function to compute mean squared error (MSE)
def compute_mse(Y_pred, Y_true):
  """
  Computes the Mean Squared Error (MSE) between predicted and true values.

  Parameters:
  - Y_pred: Predicted values.
  - Y_true: True values.

  Returns:
  - mse: The mean squared error.
  """
  mse = np.square(np.linalg.norm(Y_pred - Y_true)) / Y_true.size
  return mse

# Function to augment data by adding a bias (for non-homogeneous models)
def augment_data(X):
  """
  Augments the data by adding a bias column (column of ones).

  Parameters:
  - X: The input data to augment.

  Returns:
  - Augmented input data with an added bias column.
  """
  ones_column = np.ones((X.shape[0], 1))
  return np.concatenate((X, ones_column), axis=1)

# Function to report true and predicted circles
def prediction_report(true_data, predicted_data, predicted_data_aug, index):
  """
  Prints a report of true and predicted circle parameters (radius and center coordinates).

  Parameters:
  - true_data: Array containing true circle parameters (radius, x, y).
  - predicted_data: Array containing predicted circle parameters using a homogeneous model.
  - predicted_data_aug: Array containing predicted circle parameters using a non-homogeneous model.
  - index: Index of the circle data to report.

  Outputs:
  - Prints the true and predicted values for the selected circle.
  """
  # Extract true and predicted values for the selected index
  true_radius, true_x, true_y = true_data[index]
  pred_radius, pred_x, pred_y = predicted_data[index]
  augp_radius, augp_x, augp_y = predicted_data_aug[index]

  # Report the true and predicted values
  print(f'True values - Radius: {true_radius}, x: {true_x}, y: {true_y}')
  print(f'Predicted homogeneous model values - Radius: {pred_radius:.2f}, x: {pred_x:.2f}, y: {pred_y:.2f}')
  print(f'Predicted non-homogeneous model values - Radius: {augp_radius:.2f}, x: {augp_x:.2f}, y: {augp_y:.2f}')

# Load dataset from the CSV file
df = pd.read_csv(csv_filename)  # Load the image data with metadata from a CSV file

# Extract features by dropping the target columns ('radius', 'x', 'y')
X = df.drop(columns=['radius', 'x', 'y']).values  # Features (only pixel data)

# Extract target values (radius, x, and y)
Y = df[['radius', 'x', 'y']].values  # Labels (radius, x, and y)

random_state = 42
train_size, valid_size, test_size = 1800, 200, 200

# for ridge regression, we will set to 0 for no regularization for OLS
alpha = 0

def split_dataset(X, Y, total_size, train_size, valid_size):
  """
  Splits the dataset into training, validation, and test sets.

  Parameters:
  X (array-like): Feature data.
  Y (array-like): Target data.
  total_size: Total number of data to consider.
  train_size: number of the data to include in the training set.
  valid_size: number of the data to include in the validation set.

  Returns:
  X_train, X_valid, X_test, Y_train, Y_valid, Y_test: Split datasets.

  """
  X = X[:total_size]
  Y = Y[:total_size]
  test_size = total_size - train_size - valid_size
  X_train_valid, X_test, Y_train_valid, Y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)
  X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_train_valid, Y_train_valid, test_size=valid_size, random_state=random_state
  )
  return X_train, X_valid, X_test, Y_train, Y_valid, Y_test

# Here we use the previous function to split our dataset into three parts.
X_train, X_valid, X_test, Y_train, Y_valid, Y_test = split_dataset(X, Y, total_size, train_size, valid_size)

# Run OLS regression without adding a bias term (homogeneous model)
mse_train_hom, mse_val_hom, Y_LS_train_hom, Y_LS_val_hom = run_ols(
  X_train, X_valid, Y_train, Y_valid, alpha, augment=False
)

# Output the results for the homogeneous model
print('Homogeneous Model Results:')
print(f'Training MSE: {mse_train_hom:.4f}')
print(f'Validation MSE: {mse_val_hom:.4f}')

Homogeneous Model Results:
Training MSE: 27.5233
Validation MSE: 79.8872

# Run OLS regression with a bias term added (non-homogeneous model)
mse_train_nhom, mse_val_nhom, Y_LS_train_nhom, Y_LS_val_nhom = run_ols(
  X_train, X_valid, Y_train, Y_valid, alpha, augment=True
)

# Output the results for the non-homogeneous model
print('Non-Homogeneous Model Results:')
print(f'Training MSE: {mse_train_nhom:.4f}')
print(f'Validation MSE: {mse_val_nhom:.4f}')

Non-Homogeneous Model Results:
Training MSE: 2.5647
Validation MSE: 12.2064

mse_test_nhom, _, _, _ = run_ols(X_train, X_test, Y_train, Y_test, alpha, augment=True)

print(f'Test MSE: {mse_test_nhom:.4f}')

Test MSE: 2.5647

print(f'homo ratio: {(mse_val_hom / mse_train_hom):.4f}')
print(f'non-homo ratio: {(mse_val_nhom / mse_train_nhom):.4f}')

homo ratio: 2.9025
non-homo ratio: 4.7594

# In this code you can see the predictions you've made using your model.


def get_predictions(X, Y, alpha=0, *, X_train, Y_train):
  _, Y_pred_hom = solve_ols(X_train, Y_train, X, alpha)
  X_aug = augment_data(X)
  X_train_aug = augment_data(X_train)
  _, Y_pred_nhom = solve_ols(X_train_aug, Y_train, X_aug, alpha)
  return Y_pred_hom, Y_pred_nhom

# Prompt the user to enter an index for prediction reporting
index = int(input('Enter the index for prediction: '))

# Generate and print the prediction report for the specified index
# Use the prediction_report function and index to see your predictions
Y_pred_hom, Y_pred_nhom = get_predictions(X_test, Y_test, alpha=alpha, X_train=X_train, Y_train=Y_train)
if 0 <= index < len(Y_test):
  prediction_report(Y_test, Y_pred_hom, Y_pred_nhom, index)

True values - Radius: 8.0, x: 11.0, y: 15.0
Predicted homogeneous model values - Radius: 6.83, x: 6.31, y: 13.31
Predicted non-homogeneous model values - Radius: 7.56, x: 9.78, y: 16.84

# Run OLS regression with a bias term added on the test set

X_train_aug = augment_data(X_train)
X_test_aug = augment_data(X_test)
mse_test, Y_pred_test, _, _ = run_ols(X_train_aug, X_test_aug, Y_train, Y_test, alpha, augment=True)

# Output the results for the better-performing approach
print('OLS regression with bias term (non-homogeneous model) on test set:')
print(f'Test MSE: {mse_test:.4f}')


# Calculate R-squared for each target variable
def r_squared(y_true, y_pred):
  ss_total = np.sum((y_true - np.mean(y_true, axis=0)) ** 2, axis=0)
  ss_residual = np.sum((y_true - y_pred) ** 2, axis=0)
  return 1 - (ss_residual / ss_total)


r_squared_values = r_squared(Y_test, Y_pred_test)

print('\nR-squared values:')
print(f'Radius: {r_squared_values[0]:.4f}')
print(f'X-coordinate: {r_squared_values[1]:.4f}')
print(f'Y-coordinate: {r_squared_values[2]:.4f}')

mae = np.mean(np.abs(Y_test - Y_pred_test), axis=0)

print('\nMean Absolute Error:')
print(f'Radius: {mae[0]:.4f}')
print(f'X-coordinate: {mae[1]:.4f}')
print(f'Y-coordinate: {mae[2]:.4f}')

OLS regression with bias term (non-homogeneous model) on test set:
Test MSE: 2.5647

R-squared values:
Radius: -0.0826
X-coordinate: -1.8811
Y-coordinate: -1.9353

Mean Absolute Error:
Radius: 2.8787
X-coordinate: 6.4406
Y-coordinate: 6.6767

# Specify the number of training, validation, testing samples.
train_size, valid_size, test_size = 200, 1800, 200
total_size = train_size + valid_size + test_size

# Here we use the split_dataset function to split our dataset into three parts.
X_train, X_valid, X_test, Y_train, Y_valid, Y_test = split_dataset(X, Y, total_size, train_size, valid_size)

# Run OLS regression without adding a bias term (homogeneous model)
mse_train_hom, mse_val_hom, Y_LS_train_hom, Y_LS_val_hom = run_ols(
  X_train, X_valid, Y_train, Y_valid, alpha, augment=False
)

# Output the results for the homogeneous model
print('Homogeneous Model Results:')
print(f'Training MSE: {mse_train_hom:.4f}')
print(f'Validation MSE: {mse_val_hom:.4f}')

Homogeneous Model Results:
Training MSE: 0.0000
Validation MSE: 138.5837

# Run OLS regression with a bias term added (non-homogeneous model)
mse_train_nhom, mse_val_nhom, Y_LS_train_nhom, Y_LS_val_nhom = run_ols(
  X_train, X_valid, Y_train, Y_valid, alpha, augment=True
)

# Output the results for the non-homogeneous model
print('Non-Homogeneous Model Results:')
print(f'Training MSE: {mse_train_nhom:.4f}')
print(f'Validation MSE: {mse_val_nhom:.4f}')

Non-Homogeneous Model Results:
Training MSE: 0.0000
Validation MSE: 12.4480

# In this code you can see the predictions you've made using your model.

# Prompt the user to enter an index for prediction reporting
index = int(input('Enter the index for prediction: '))

# Generate and print the prediction report for the specified index
# Use the prediction_report function and index to see your predictions

Y_pred_hom, Y_pred_nhom = get_predictions(X_test, Y_test, alpha=alpha, X_train=X_train, Y_train=Y_train)
if 0 <= index < len(Y_test):
  prediction_report(Y_test, Y_pred_hom, Y_pred_nhom, index)

True values - Radius: 8.0, x: 11.0, y: 15.0
Predicted homogeneous model values - Radius: 5.91, x: -1.07, y: 2.67
Predicted non-homogeneous model values - Radius: 7.79, x: 8.94, y: 12.32

# Run OLS regression with a bias term added on the test set

X_train_aug = augment_data(X_train)
X_test_aug = augment_data(X_test)
mse_test, Y_pred_test, _, _ = run_ols(X_train_aug, X_test_aug, Y_train, Y_test, alpha, augment=True)

# Output the results for the better-performing approach
print('OLS regression with bias term (non-homogeneous model) on test set:')
print(f'Test MSE: {mse_test:.4f}')

# Calculate R-squared for each target variable
r_squared_values = r_squared(Y_test, Y_pred_test)

print('\nR-squared values:')
print(f'Radius: {r_squared_values[0]:.4f}')
print(f'X-coordinate: {r_squared_values[1]:.4f}')
print(f'Y-coordinate: {r_squared_values[2]:.4f}')

mae = np.mean(np.abs(Y_test - Y_pred_test), axis=0)

print('\nMean Absolute Error:')
print(f'Radius: {mae[0]:.4f}')
print(f'X-coordinate: {mae[1]:.4f}')
print(f'Y-coordinate: {mae[2]:.4f}')

OLS regression with bias term (non-homogeneous model) on test set:
Test MSE: 0.0000

R-squared values:
Radius: -1.7235
X-coordinate: -0.3874
Y-coordinate: -0.4298

Mean Absolute Error:
Radius: 4.3925
X-coordinate: 4.2203
Y-coordinate: 4.3336

# Load dataset from the CSV file
df = pd.read_csv(csv_filename)  # Load the image data with metadata from a CSV file

# Extract features by dropping the target columns ('radius', 'x', 'y')
X = df.drop(columns=['radius', 'x', 'y']).values  # Features (only pixel data)

# Extract target values (radius, x, and y)
Y = df[['radius', 'x', 'y']].values  # Labels (radius, x, and y)

# Specify the number of training, validation, testing samples.
train_size, valid_size, test_size = 200, 1800, 200
total_size = train_size + valid_size + test_size

# Here we use the split_dataset function to split our dataset into three parts.
X_train, X_valid, X_test, Y_train, Y_valid, Y_test = split_dataset(X, Y, total_size, train_size, valid_size)

# Define a range of alpha values (regularization parameters) using an exponential scale
alphas = np.exp(np.arange(-2, 4.5, 0.5))

train_mse_list = []
val_mse_list = []

# Iterate over the range of alpha values
for alpha in alphas:
  # Run RLS regression with a bias term added on train and validation sets
  mse_train, mse_val, _, _ = run_ols(X_train, X_valid, Y_train, Y_valid, alpha, augment=True)

  # Store the MSE values
  train_mse_list.append(mse_train)
  val_mse_list.append(mse_val)


optimal_alpha = alphas[np.argmin(val_mse_list)]

# Plot the MSE for the non-homogeneous model across different alphas
plt.figure(figsize=(10, 6))
plt.semilogx(alphas, train_mse_list, 'b-', label='training MSE')
plt.semilogx(alphas, val_mse_list, 'r-', label='validation MSE')
plt.xlabel('Alpha (log scale)')
plt.ylabel('Mean Squared Error')
plt.title('Training and Validation MSE vs Alpha (non-homogeneous model)')
plt.grid(True)

plt.axvline(x=optimal_alpha, color='g', linestyle='--', label=f'Optional alpha  = {optimal_alpha:.4f}')
plt.legend()
plt.show()

# Here you find the best value of lambda
print(f'optimal alpha from val: {alphas[np.argmin(val_mse_list)]:.4f}')

optimal alpha from val: 7.3891

# Run OLS regression with a bias term added (non-homogeneous model)

best_alpha = optimal_alpha
X_train_aug = augment_data(X_train)
X_test_aug = augment_data(X_test)
mse_test, Y_pred_test, _, _ = run_ols(X_train_aug, X_test_aug, Y_train, Y_test, best_alpha, augment=True)

# Output the results for the non-homogeneous model
print('OLS regression with bias term (non-homogeneous model) on test set:')
print(f'Test MSE: {mse_test:.4f}')

# Calculate R-squared for each target variable
r_squared_values = r_squared(Y_test, Y_pred_test)

print('\nR-squared values:')
print(f'Radius: {r_squared_values[0]:.4f}')
print(f'X-coordinate: {r_squared_values[1]:.4f}')
print(f'Y-coordinate: {r_squared_values[2]:.4f}')

mae = np.mean(np.abs(Y_test - Y_pred_test), axis=0)

print('\nMean Absolute Error:')
print(f'Radius: {mae[0]:.4f}')
print(f'X-coordinate: {mae[1]:.4f}')
print(f'Y-coordinate: {mae[2]:.4f}')

OLS regression with bias term (non-homogeneous model) on test set:
Test MSE: 1.1518

R-squared values:
Radius: -0.0030
X-coordinate: -2.3763
Y-coordinate: -2.4275

Mean Absolute Error:
Radius: 2.7962
X-coordinate: 7.0633
Y-coordinate: 7.2844

from scipy.fftpack import dct

# Load dataset from the CSV file
df = pd.read_csv(csv_filename)  # Load the image data with metadata from a CSV file

# Extract features by dropping the target columns ('radius', 'x', 'y')
X = df.drop(columns=['radius', 'x', 'y']).values  # Features (only pixel data)

# Extract target values (radius, x, and y)
Y = df[['radius', 'x', 'y']].values  # Labels (radius, x, and y)

# set the number DCT coefficient
n_coeffs = 100


# Define a function to transforms pixel values to a new space.
def transform(X_df: pd.DataFrame):
  X_images = X_df.reshape(-1, 30, 30)
  dct_coeffs = np.array([dct(dct(img.T, norm='ortho').T, norm='ortho') for img in X_images])

  # zig-zag scan
  def zigzag(a):
    return np.concatenate([
      np.diagonal(a[::-1, :], k)[:: (2 * (k % 2) - 1)] for k in range(1 - a.shape[0], a.shape[0])
    ])

  top_coeffs = np.array([zigzag(img)[:n_coeffs] for img in dct_coeffs])
  return pd.DataFrame(top_coeffs, columns=[f'DCT_{i}' for i in range(n_coeffs)])


# Apply the transform function to the DataFrame
processed_X_df = transform(X)

# Display the first few rows of the updated DataFrame
processed_X_df.head()

np.array(processed_X_df).shape, np.array(X).shape

((2200, 100), (2200, 900))

# Specify the number of training, validation, testing samples.
train_size, valid_size, test_size = 200, 1800, 200
total_size = train_size + valid_size + test_size

########################################################
########################################################
#### Please take note that we insert processed_X_df ####
#### as the new input for splitting                 ####
########################################################
########################################################
Z_train, Z_valid, Z_test, Y_train, Y_valid, Y_test = split_dataset(
  processed_X_df, Y, total_size, train_size, valid_size
)

# Define a range of alpha values (regularization parameters) using an exponential scale
alphas = np.exp(np.arange(-2, 4.5, 0.5))

train_mse_list = []
val_mse_list = []

# Iterate over the range of alpha values
for alpha in alphas:
  # Run RLS regression with a bias term added on train and validation sets
  mse_train, mse_val, _, _ = run_ols(Z_train, Z_valid, Y_train, Y_valid, alpha, augment=True)

  # Store the MSE values
  train_mse_list.append(mse_train)
  val_mse_list.append(mse_val)

# Plot the MSE for the non-homogeneous model across different alphas
optimal_alpha = alphas[np.argmin(val_mse_list)]

# Plot the MSE for the non-homogeneous model across different alphas
plt.figure(figsize=(10, 6))
plt.semilogx(alphas, train_mse_list, 'b-', label='training MSE')
plt.semilogx(alphas, val_mse_list, 'r-', label='validation MSE')
plt.xlabel('Alpha (log scale)')
plt.ylabel('Mean Squared Error')
plt.title('Training and Validation MSE vs Alpha with DCT preprocessing (non-homogeneous model)')
plt.grid(True)

plt.axvline(x=optimal_alpha, color='g', linestyle='--', label=f'Optional alpha  = {optimal_alpha:.4f}')
plt.legend()
plt.show()

# Here you find the best value of lambda
best_alpha = alphas[np.argmin(val_mse_list)]

# Run OLS regression with a bias term added (non-homogeneous model)
Z_train_aug = augment_data(Z_train)
Z_test_aug = augment_data(Z_test)
mse_test, Y_pred_test, _, _ = run_ols(Z_train_aug, Z_test_aug, Y_train, Y_test, best_alpha, augment=True)

# Output the results for the non-homogeneous model
print('OLS regression with bias term (non-homogeneous model) on test set:')
print(f'Test MSE: {mse_test:.4f}')

# Calculate R-squared for each target variable
r_squared_values = r_squared(Y_test, Y_pred_test)

print('\nR-squared values:')
print(f'Radius: {r_squared_values[0]:.4f}')
print(f'X-coordinate: {r_squared_values[1]:.4f}')
print(f'Y-coordinate: {r_squared_values[2]:.4f}')

mae = np.mean(np.abs(Y_test - Y_pred_test), axis=0)

print('\nMean Absolute Error:')
print(f'Radius: {mae[0]:.4f}')
print(f'X-coordinate: {mae[1]:.4f}')
print(f'Y-coordinate: {mae[2]:.4f}')

OLS regression with bias term (non-homogeneous model) on test set:
Test MSE: 3.2911

R-squared values:
Radius: -0.3946
X-coordinate: -4.1630
Y-coordinate: -4.1937

Mean Absolute Error:
Radius: 3.1738
X-coordinate: 9.0496
Y-coordinate: 9.2581

from scipy.optimize import minimize_scalar
from scipy.integrate import quad


def objective(log_a):
  a = np.exp(log_a)
  integrand = lambda x: (log_a - np.log(x)) ** 2
  integral_val, _ = quad(integrand, 2, 4)
  return integral_val


result = minimize_scalar(objective)
# Extract the optimal value
log_a_opt = result.x
a_opt = np.exp(log_a_opt)

print(f'Optimal a: {a_opt}')
print(f'Analytical a: {8 / np.exp(1)}')
print(f'Difference: {abs(a_opt - 8 / np.exp(1))}')

Optimal a: 2.9430355293715387
Analytical a: 2.9430355293715387
Difference: 0.0

	DCT_0	DCT_1	DCT_2	DCT_3	DCT_4	DCT_5	DCT_6	DCT_7	DCT_8	DCT_9	...	DCT_90	DCT_91	DCT_92	DCT_93	DCT_94	DCT_95	DCT_96	DCT_97	DCT_98	DCT_99
0	2.733333	0.212398	-0.478829	-1.411181	-0.028773	-1.421545	0.019093	0.300010	-0.105608	-0.117692	...	0.214257	0.364037	-0.200278	0.007374	0.077653	-0.667343	-0.031089	0.105552	0.007250	-0.116501
1	0.833333	-0.748780	-0.985477	0.517045	0.878510	-0.116845	0.643040	0.154633	-0.446819	-0.028776	...	-0.045015	-0.112985	0.096275	0.152201	0.233411	0.215333	-0.083753	-0.113022	0.251835	0.253050
2	1.733333	-0.666110	0.426438	-1.556544	-0.235478	-1.050712	0.477222	-0.134717	0.552684	-0.461624	...	0.352240	0.153732	0.094499	0.668909	-0.131834	0.260840	0.034612	0.031434	0.126959	-0.097651
3	3.000000	-0.388020	0.475287	-0.987118	-0.003971	-1.308943	-0.107352	-0.019110	-0.226248	0.358313	...	0.210819	-0.039888	-0.170631	-0.252521	-0.134813	0.267620	-0.517697	0.092528	0.238955	-0.126219
4	4.066667	-0.282951	0.426663	-1.597643	0.271608	-1.984372	-0.190114	0.005286	0.249626	-0.313862	...	0.246831	0.518051	-0.037595	-0.474278	0.107304	-0.085202	-0.269762	0.245302	0.187064	0.716002

Assignment 1 - Coding Section¶

Section One: Dataset¶

Importing necessary libraries¶

Parameters¶

Generating the Dataset¶

Dataset Dimensions¶

Visual Exploration of Sample Data Points¶

Section Two: Important functions¶

Section Three: Questions¶

PROBLEM 1 (20 marks): Predicting radius, x, and y using OLS¶

PART 1 (10 marks):¶

PART 2 (10 marks):¶

Compare these results with those from the previous part. Do you observe less overfitting or more overfitting? How did you arrive at this conclusion?¶

PROBLEM 2 (20 marks): Regularized Least Squares¶

PART 1 (15 marks):¶

PART 2 (5 marks):¶

PROBLEM 3 (30 marks): Preprocessing the data¶

PART 1 (5 marks):¶

PART 2 (10 marks):¶

PART 3 (15 marks):¶

question 3.¶