refactor: replace pandas with csv module for timezone data loading

This commit is contained in:
2026-05-10 12:43:11 +02:00
parent 584231b0df
commit 8f8c3655db
+52 -30
View File
@@ -1,6 +1,6 @@
import pytz
from datetime import datetime
import pandas as pd
from csv import DictReader
def get_tz_info(tz_name: str, timezones: list[dict]) -> dict | None:
@@ -41,52 +41,74 @@ def load_tz_file():
"abbreviation", "time_start", "gmt_offset", "dst"]
# columns to load
load_columns = ["zone_name", "country_code"]
# read csv with pandas
df = pd.read_csv(timezone_file, names=timezone_names)
# read csv
with open(timezone_file, newline='') as csvfile:
reader = DictReader(csvfile, fieldnames=timezone_names)
csv = [row for row in reader]
# drop all columns except load_columns
df = df[load_columns]
csv = [{k: v for k, v in row.items() if k in load_columns} for row in csv]
# distinct zone_names
df = df.drop_duplicates(subset=["zone_name"])
seen = set()
unique_csv = []
for row in csv:
if row["zone_name"] not in seen:
seen.add(row["zone_name"])
unique_csv.append(row)
csv = unique_csv
# reset index
df = df.reset_index(drop=True)
return df
return csv
def main():
    """Match pytz timezones against the timezone CSV and print a summary.

    Takes the rows returned by ``load_tz_file()``, splits every
    ``zone_name`` into ``region`` and ``city`` at the first "/", and joins
    them with ``pytz.all_timezones`` on ``(region, city)``. Prints the
    number of merged entries, the first 20 of them, and the number of
    merged entries per region.
    """

    def split_zone(zone_name):
        # Split only at the first "/" so multi-level names keep their tail
        # in the city part; names without "/" get city None.
        region, sep, city = zone_name.partition("/")
        return region, (city if sep else None)

    # read csv file and load timezones and countries, indexed by
    # (region, city); rows with no country_code (like Etc, GMT, etc)
    # are dropped
    country_by_zone = {}
    for row in load_tz_file():
        if not row["country_code"]:
            continue
        # setdefault keeps the first row for a key, matching a
        # first-match-wins join
        country_by_zone.setdefault(
            split_zone(row["zone_name"]), row["country_code"])

    # get all timezones from pytz and join data on region and city
    timezones = []
    for zone_name in pytz.all_timezones:
        region, city = split_zone(zone_name)
        # drop zones with no city (like UTC, GMT, etc) and the 'Etc' region
        if not city or region == "Etc":
            continue
        country_code = country_by_zone.get((region, city))
        if country_code is not None:
            timezones.append({
                "region": region,
                "city": city,
                "country_code": country_code,
            })

    # print merged data
    print(f"Merged timezones: {len(timezones)}")
    print(timezones[:20])

    # count merged entries per region in a single pass
    merged_per_region = {}
    for row in timezones:
        merged_per_region[row["region"]] = (
            merged_per_region.get(row["region"], 0) + 1)
    for region, count in merged_per_region.items():
        print(f"{count} merged in {region}")