refactor: replace pandas with csv module for timezone data loading
This commit is contained in:
+52
-30
@@ -1,6 +1,6 @@
|
||||
import pytz
|
||||
from datetime import datetime
|
||||
import pandas as pd
|
||||
from csv import DictReader
|
||||
|
||||
|
||||
def get_tz_info(tz_name: str, timezones: list[dict]) -> dict | None:
|
||||
@@ -41,52 +41,74 @@ def load_tz_file():
|
||||
"abbreviation", "time_start", "gmt_offset", "dst"]
|
||||
# columns to load
|
||||
load_columns = ["zone_name", "country_code"]
|
||||
# read csv with pandas
|
||||
df = pd.read_csv(timezone_file, names=timezone_names)
|
||||
# read csv
|
||||
with open(timezone_file, newline='') as csvfile:
|
||||
reader = DictReader(csvfile, fieldnames=timezone_names)
|
||||
csv = [row for row in reader]
|
||||
# drop all columns except load_columns
|
||||
df = df[load_columns]
|
||||
csv = [{k: v for k, v in row.items() if k in load_columns} for row in csv]
|
||||
# distinct zone_names
|
||||
df = df.drop_duplicates(subset=["zone_name"])
|
||||
seen = set()
|
||||
unique_csv = []
|
||||
for row in csv:
|
||||
if row["zone_name"] not in seen:
|
||||
seen.add(row["zone_name"])
|
||||
unique_csv.append(row)
|
||||
csv = unique_csv
|
||||
|
||||
# reset index
|
||||
df = df.reset_index(drop=True)
|
||||
|
||||
return df
|
||||
return csv
|
||||
|
||||
|
||||
def main() -> None:
    """Cross-reference the timezone CSV with pytz and print a summary.

    Loads the ``zone_name`` / ``country_code`` rows returned by
    ``load_tz_file()``, splits every zone name into ``region`` and ``city``
    at the first "/", and inner-joins those rows with
    ``pytz.all_timezones`` on ``(region, city)``.

    Prints the number of merged zones, the first 20 merged rows and the
    number of merged zones per region (regions listed in the order they
    first appear in the merged data).
    """
    # read csv file and load timezones and countries
    file_rows = load_tz_file()

    # Index the csv rows by (region, city) so the join below is a dict
    # lookup instead of a scan over every csv row for every pytz zone.
    country_by_key = {}
    for row in file_rows:
        country_code = row["country_code"]
        # drop regions with no country_code (like Etc, GMT, etc)
        if not country_code:
            continue
        # split zone_name into components by "/"
        region, sep, city = row["zone_name"].partition("/")
        key = (region, city if sep else None)
        # setdefault keeps the first row seen for a key, matching the
        # first-match-wins behaviour of a linear scan with `break`.
        country_by_key.setdefault(key, country_code)

    # join data on region and city
    timezones = []
    for zone_name in pytz.all_timezones:
        # split zone_name into components by "/"
        region, _, city = zone_name.partition("/")
        # drop zones with no city (like UTC, GMT, etc) and the 'Etc' region
        if not city or region == "Etc":
            continue
        country_code = country_by_key.get((region, city))
        if country_code is None:
            # zone known to pytz but absent from the csv: inner join drops it
            continue
        timezones.append({
            "region": region,
            "city": city,
            "country_code": country_code,
        })

    # print merged data
    print(f"Merged timezones: {len(timezones)}")
    print(timezones[:20])

    # Count zones per region in a single pass. A dict keeps insertion
    # order, so regions are reported in first-appearance order and the
    # output is identical from run to run (iterating a set of strings is
    # not, because string hashing is randomised per process).
    merged_per_region = {}
    for row in timezones:
        region = row["region"]
        merged_per_region[region] = merged_per_region.get(region, 0) + 1

    for region, count in merged_per_region.items():
        print(f"{count} merged in {region}")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user