-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtask_i_pandas.py
72 lines (52 loc) · 2.22 KB
/
task_i_pandas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
This program denormalizes electricity usage data by merging multiple data sources into a Pandas DataFrame.
It performs the following main operations:
1. Reads input data from text files
2. Merges different datasets using Pandas
3. Calculate the average electricity usage per region and dwelling type in a specific time period
4. Converts the results to a list of ElectricityRecord objects as per requirements
Author: Jacqueline Ong
Date: 12/16/2024
"""
import pandas as pd
from models import ElectricityRecord
def read_input():
area_df = pd.read_csv('Area.txt', sep=';')
date_dim_df = pd.read_csv('DateDim.txt', sep=';')
dwelling_df = pd.read_csv('Dwelling.txt', sep=',')
electricity_df = pd.read_csv('Electricity.txt'
, sep=';')
return [area_df, date_dim_df, dwelling_df, electricity_df]
# merging DataFrames using Pandas library functions
def merge(area_df: pd.DataFrame, date_dim_df: pd.DataFrame, dwelling_df: pd.DataFrame, electricity_df: pd.DataFrame) -> pd.DataFrame:
merged_df = pd.merge(electricity_df, area_df, on='AreaID', how='inner')
merged_df = pd.merge(merged_df, date_dim_df, on='DateID', how='inner')
merged_df = pd.merge(merged_df, dwelling_df, left_on='dwelling_type_id', right_on='TypeID', how='inner')
return merged_df
# select relevant columns
def transform(merged_df: pd.DataFrame):
df = merged_df[['Region', 'Area', 'year', 'month', 'dwelling_type', 'kwh_per_acc']]
df = df.sort_values(['Area', 'year', 'month', 'dwelling_type'])
return df
def make_output(denormalized_df: pd.DataFrame):
records = [
ElectricityRecord(
region=row.Region,
area=row.Area,
year=row.year,
month=row.month,
dwelling_type=row.dwelling_type,
avg_kwh_per_acc=row.kwh_per_acc
)
for row in denormalized_df.itertuples()
]
return records
if __name__ == "__main__":
input_df = read_input()
merged_df = merge(input_df[0], input_df[1], input_df[2], input_df[3])
denormalized_df = transform(merged_df)
result = make_output(denormalized_df)
print(f"Number of records: {len(result)}")
print("\nFirst 3 records:")
for record in result[:3]:
print(record)