-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlinear_regression.py
133 lines (103 loc) · 4.32 KB
/
linear_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import curses
from sklearn.preprocessing import OrdinalEncoder as oe
from sklearn.linear_model import LinearRegression as lr
from kaggle_connect import kaggle_connect as kc
# Auxiliary functions
def encode_column(df, column_name, ordinal_categories):
    """
    Swap a column of ``df`` for its ordinal encoding, modifying ``df`` in place.

    Args:
        df: DataFrame holding the column to encode. It is mutated.
        column_name: Name of the column to encode.
        ordinal_categories: Category values in the order the encoder should
            use; passed to the encoder as the single entry of ``categories``.

    Returns:
        The same ``df`` object, now holding the encoded values under
        ``'new_' + column_name`` and no longer holding ``column_name``.
    """
    encoded_name = 'new_' + column_name
    ordinal_encoder = oe(categories=[ordinal_categories])
    # The encoder is fitted on a one-column frame (double brackets), not a Series.
    encoded_values = ordinal_encoder.fit_transform(df[[column_name]])
    df[encoded_name] = encoded_values
    # Remove the raw column so only the encoded version remains.
    df.drop(columns=[column_name], inplace=True)
    return df
def get_column_input(df, prompt):
    """
    Prompt for a column name, offering substring matches to choose from.

    The user first types any fragment of a column name. Every column whose
    name contains that fragment is listed, and the user must then type one of
    the listed names exactly. Both prompts repeat until a listed column is
    chosen.

    Args:
        df: Object exposing the available column names through ``df.columns``.
        prompt: Text shown when asking for the search fragment.

    Returns:
        The chosen column name, exactly as it appears in ``df.columns``.
    """
    while True:
        column_name = input(prompt)
        # Filter columns containing the entered text
        matching_columns = [col for col in df.columns if column_name in col]
        if not matching_columns:
            print(f"No columns found containing '{column_name}'. Please try again.")
            continue

        print(f"Columns containing '{column_name}': {matching_columns}")
        # Ask the user to select an exact column name from the matches
        selected_column = input("Enter the exact column name from the list above: ")
        # Validate against the suggestions that were actually displayed, so the
        # check agrees with the prompt and with the rejection message below.
        if selected_column in matching_columns:
            return selected_column
        print("Invalid selection. Please choose a column from the list.")
def get_ordinal_categories(column_values):
    """
    Return the given values as a new list in ascending order.

    The natural sort order is used as the ordinal order. If the real order of
    the categories is known beforehand, this function can be customized.
    The input iterable itself is not modified.
    """
    ordered = list(column_values)
    ordered.sort()
    return ordered
# Create a new function so we can access kaggle_connect file
def run_kaggle_download():
    """
    Run the ``kaggle_connect`` routine under ``curses.wrapper``.

    Returns:
        Whatever ``kaggle_connect`` returns, passed through by
        ``curses.wrapper``.
    """
    download_result = curses.wrapper(kc)
    return download_result
#Main
# Fetch the dataset through the curses-driven Kaggle helper and wrap it in a
# DataFrame so the rest of the script works on a known type.
data = run_kaggle_download()
df = pd.DataFrame(data)

# Show the number of unique values per column
print(df.nunique())

# Select column to encode
values = get_column_input(df, "Enter the name of the column to encode: ")
encoded_column = 'new_' + values
ordinal_categories = get_ordinal_categories(df[values].unique())
# Encode the DataFrame that was inspected above, not the raw download result,
# which is only guaranteed to be something pd.DataFrame() accepts.
data = encode_column(df, values, ordinal_categories)

# Select the columns to average: every column whose name ends with the entered
# text. Keep asking until at least one column matches, because averaging over
# zero columns would only produce NaN targets and a late failure in fit().
while True:
    value_media = input("Enter the name of the columns to average: ")
    score_columns = [
        col for col in data.columns
        # Never treat the encoded feature column as a score column, otherwise
        # it would be dropped below and the regression would have no input.
        if value_media and col != encoded_column and str(col).endswith(value_media)
    ]
    if score_columns:
        break
    print(f"No columns found ending with '{value_media}'. Please try again.")

# Compute the rounded row-wise average first, then drop the source columns, and
# only then store the result. This order keeps the new column alive even when
# value_media is itself the name of one of the averaged columns.
average = round(data[score_columns].sum(axis=1) / len(score_columns))
data.drop(columns=score_columns, inplace=True)
data[value_media] = average

# Linear regression model: encoded category -> averaged score
x = data[[encoded_column]]
y = data[value_media]
model = lr()
model.fit(x, y)
y_pred = model.predict(x)

# One row per distinct (encoded value, rounded prediction) pair, ordered by
# the encoded value.
table_data = pd.DataFrame({
    values: x.values.flatten(),  # Encoded value of every row
    value_media + '_prediction': y_pred.round(1),  # Predicted values
})
sorted_table = table_data.sort_values(by=values, ascending=True).drop_duplicates()

# Ask the user for a human-readable description of each encoded value.
value_names = []
for idx, i in enumerate(sorted_table[values]):
    a = input(f"Enter a value for element {idx} (current: {i}): ")
    value_names.append(a)

# Plot the regression line
plt.plot(x, y_pred, color='red', linewidth=2, label='Regression Line')
# Add points on the regression line
plt.scatter(x, y_pred, color='red', label='Points on the Regression Line')

# Table shown under the plot: encoded value, user description, prediction.
# Plain lists are used so the three columns line up positionally.
table_data_d = {
    'Value': sorted_table[values].tolist(),
    'Description': value_names,
    'Prediction': sorted_table[value_media + '_prediction'].tolist(),
}
# Build the displayed table from the described, de-duplicated rows (not from
# the full per-row table_data).
df_table = pd.DataFrame(table_data_d)
table = plt.table(cellText=df_table.values,
                  colLabels=df_table.columns,
                  cellLoc='center',
                  loc='bottom',
                  bbox=[0.1, -0.5, 0.8, 0.3])  # Adjust the position and size

# Configuration to avoid visualization issues
table.auto_set_font_size(False)
table.set_fontsize(10)  # Ensure the text is readable

# Adjust the background of the plot
plt.gca().set_facecolor('white')

# Adjust the layout to give space for the table
plt.subplots_adjust(left=0.1, bottom=0.5)
plt.xlabel(values)
plt.ylabel(value_media + '_prediction')
plt.title("Linear Regression")
plt.legend()
plt.show()