From 8f8c3655db1236460a49ffe22b332066538eb625 Mon Sep 17 00:00:00 2001 From: zwitschi Date: Sun, 10 May 2026 12:43:11 +0200 Subject: [PATCH] refactor: replace pandas with csv module for timezone data loading --- test_print_timezones.py | 82 ++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 30 deletions(-) diff --git a/test_print_timezones.py b/test_print_timezones.py index a3b81c8..f82e001 100644 --- a/test_print_timezones.py +++ b/test_print_timezones.py @@ -1,6 +1,6 @@ import pytz from datetime import datetime -import pandas as pd +from csv import DictReader def get_tz_info(tz_name: str, timezones: list[dict]) -> dict | None: @@ -41,52 +41,74 @@ def load_tz_file(): "abbreviation", "time_start", "gmt_offset", "dst"] # columns to load load_columns = ["zone_name", "country_code"] - # read csv with pandas - df = pd.read_csv(timezone_file, names=timezone_names) + # read csv + with open(timezone_file, newline='') as csvfile: + reader = DictReader(csvfile, fieldnames=timezone_names) + csv = [row for row in reader] # drop all columns except load_columns - df = df[load_columns] + csv = [{k: v for k, v in row.items() if k in load_columns} for row in csv] # distinct zone_names - df = df.drop_duplicates(subset=["zone_name"]) + seen = set() + unique_csv = [] + for row in csv: + if row["zone_name"] not in seen: + seen.add(row["zone_name"]) + unique_csv.append(row) + csv = unique_csv - # reset index - df = df.reset_index(drop=True) - - return df + return csv def main(): - # read csv with pandas - df_file = load_tz_file() + # read csv file and load timezones and countries + csv = load_tz_file() # split zone_name into components by "/" - df_file[['region', 'city']] = df_file['zone_name'].str.split( - '/', expand=True, n=1) + for row in csv: + parts = row["zone_name"].split("/", 1) + row["region"] = parts[0] + row["city"] = parts[1] if len(parts) > 1 else None # drop regions with no country_code (like Etc, GMT, etc) - df_file = df_file[df_file['country_code'].notna()] + csv = [row for row in csv if row["country_code"]] + + # get all timezones from pytz and split into region and city + + tz = [{"zone_name": tz} for tz in pytz.all_timezones] - df_tz = pd.DataFrame(pytz.all_timezones) - # rename column to zone_name - df_tz = df_tz.rename(columns={0: 'zone_name'}) # split zone_name into components by "/" - df_tz[['region', 'city']] = df_tz['zone_name'].str.split( - '/', expand=True, n=1) + for row in tz: + parts = row["zone_name"].split("/", 1) + row["region"] = parts[0] + row["city"] = parts[1] if len(parts) > 1 else None # drop regions with no city (like UTC, GMT, etc) - df_tz = df_tz[df_tz['city'].notna()] + tz = [row for row in tz if row["city"]] # drop rows where region is 'Etc' - df_tz = df_tz[df_tz['region'] != 'Etc'] + tz = [row for row in tz if row["region"] != "Etc"] + + # join data on region and city + timezones = [] + for tz_row in tz: + for csv_row in csv: + if tz_row["region"] == csv_row["region"] and tz_row["city"] == csv_row["city"]: + timezones.append({ + "zone_name": tz_row["zone_name"], + "country_code": csv_row["country_code"], + "region": tz_row["region"], + "city": tz_row["city"], + }) + break - # join dataframes on region and city - df_merged = pd.merge(df_file, df_tz, on=[ - 'region', 'city'], how='inner', indicator=True) # reorder columns - df_merged = df_merged[['region', 'city', 'country_code']] - # print merged dataframe - print(f"Merged timezones: {len(df_merged)}") - print(df_merged.sample(20).to_string(index=False)) - regions = df_merged['region'].unique() + timezones = [{k: row[k] for k in ['region', 'city', 'country_code']} + for row in timezones] + + # print merged data + print(f"Merged timezones: {len(timezones)}") + print(timezones[:20]) + regions = set(row['region'] for row in timezones) for region in regions: - df_region = df_merged[df_merged['region'] == region] + df_region = [row for row in timezones if row['region'] == region] print(f"{len(df_region)} merged in {region}")