refactor: replace pandas with csv module for timezone data loading

This commit is contained in:
2026-05-10 12:43:11 +02:00
parent 584231b0df
commit 8f8c3655db
+52 -30
View File
@@ -1,6 +1,6 @@
import pytz import pytz
from datetime import datetime from datetime import datetime
import pandas as pd from csv import DictReader
def get_tz_info(tz_name: str, timezones: list[dict]) -> dict | None: def get_tz_info(tz_name: str, timezones: list[dict]) -> dict | None:
@@ -41,52 +41,74 @@ def load_tz_file():
"abbreviation", "time_start", "gmt_offset", "dst"] "abbreviation", "time_start", "gmt_offset", "dst"]
# columns to load # columns to load
load_columns = ["zone_name", "country_code"] load_columns = ["zone_name", "country_code"]
# read csv with pandas # read csv
df = pd.read_csv(timezone_file, names=timezone_names) with open(timezone_file, newline='') as csvfile:
reader = DictReader(csvfile, fieldnames=timezone_names)
csv = [row for row in reader]
# drop all columns except load_columns # drop all columns except load_columns
df = df[load_columns] csv = [{k: v for k, v in row.items() if k in load_columns} for row in csv]
# distinct zone_names # distinct zone_names
df = df.drop_duplicates(subset=["zone_name"]) seen = set()
unique_csv = []
for row in csv:
if row["zone_name"] not in seen:
seen.add(row["zone_name"])
unique_csv.append(row)
csv = unique_csv
# reset index return csv
df = df.reset_index(drop=True)
return df
def main(): def main():
# read csv with pandas # read csv file and load timezones and countries
df_file = load_tz_file() csv = load_tz_file()
# split zone_name into components by "/" # split zone_name into components by "/"
df_file[['region', 'city']] = df_file['zone_name'].str.split( for row in csv:
'/', expand=True, n=1) parts = row["zone_name"].split("/", 1)
row["region"] = parts[0]
row["city"] = parts[1] if len(parts) > 1 else None
# drop regions with no country_code (like Etc, GMT, etc) # drop regions with no country_code (like Etc, GMT, etc)
df_file = df_file[df_file['country_code'].notna()] csv = [row for row in csv if row["country_code"]]
# get all timezones from pytz and split into region and city
tz = [{"zone_name": tz} for tz in pytz.all_timezones]
df_tz = pd.DataFrame(pytz.all_timezones)
# rename column to zone_name
df_tz = df_tz.rename(columns={0: 'zone_name'})
# split zone_name into components by "/" # split zone_name into components by "/"
df_tz[['region', 'city']] = df_tz['zone_name'].str.split( for row in tz:
'/', expand=True, n=1) parts = row["zone_name"].split("/", 1)
row["region"] = parts[0]
row["city"] = parts[1] if len(parts) > 1 else None
# drop regions with no city (like UTC, GMT, etc) # drop regions with no city (like UTC, GMT, etc)
df_tz = df_tz[df_tz['city'].notna()] tz = [row for row in tz if row["city"]]
# drop rows where region is 'Etc' # drop rows where region is 'Etc'
df_tz = df_tz[df_tz['region'] != 'Etc'] tz = [row for row in tz if row["region"] != "Etc"]
# join data on region and city
timezones = []
for tz_row in tz:
for csv_row in csv:
if tz_row["region"] == csv_row["region"] and tz_row["city"] == csv_row["city"]:
timezones.append({
"zone_name": tz_row["zone_name"],
"country_code": csv_row["country_code"],
"region": tz_row["region"],
"city": tz_row["city"],
})
break
# join dataframes on region and city
df_merged = pd.merge(df_file, df_tz, on=[
'region', 'city'], how='inner', indicator=True)
# reorder columns # reorder columns
df_merged = df_merged[['region', 'city', 'country_code']] timezones = [{k: row[k] for k in ['region', 'city', 'country_code']}
# print merged dataframe for row in timezones]
print(f"Merged timezones: {len(df_merged)}")
print(df_merged.sample(20).to_string(index=False)) # print merged data
regions = df_merged['region'].unique() print(f"Merged timezones: {len(timezones)}")
print(timezones[:20])
regions = set(row['region'] for row in timezones)
for region in regions: for region in regions:
df_region = df_merged[df_merged['region'] == region] df_region = [row for row in timezones if row['region'] == region]
print(f"{len(df_region)} merged in {region}") print(f"{len(df_region)} merged in {region}")