refactor: replace pandas with csv module for timezone data loading
This commit is contained in:
+52
-30
@@ -1,6 +1,6 @@
|
|||||||
import pytz
|
import pytz
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import pandas as pd
|
from csv import DictReader
|
||||||
|
|
||||||
|
|
||||||
def get_tz_info(tz_name: str, timezones: list[dict]) -> dict | None:
|
def get_tz_info(tz_name: str, timezones: list[dict]) -> dict | None:
|
||||||
@@ -41,52 +41,74 @@ def load_tz_file():
|
|||||||
"abbreviation", "time_start", "gmt_offset", "dst"]
|
"abbreviation", "time_start", "gmt_offset", "dst"]
|
||||||
# columns to load
|
# columns to load
|
||||||
load_columns = ["zone_name", "country_code"]
|
load_columns = ["zone_name", "country_code"]
|
||||||
# read csv with pandas
|
# read csv
|
||||||
df = pd.read_csv(timezone_file, names=timezone_names)
|
with open(timezone_file, newline='') as csvfile:
|
||||||
|
reader = DictReader(csvfile, fieldnames=timezone_names)
|
||||||
|
csv = [row for row in reader]
|
||||||
# drop all columns except load_columns
|
# drop all columns except load_columns
|
||||||
df = df[load_columns]
|
csv = [{k: v for k, v in row.items() if k in load_columns} for row in csv]
|
||||||
# distinct zone_names
|
# distinct zone_names
|
||||||
df = df.drop_duplicates(subset=["zone_name"])
|
seen = set()
|
||||||
|
unique_csv = []
|
||||||
|
for row in csv:
|
||||||
|
if row["zone_name"] not in seen:
|
||||||
|
seen.add(row["zone_name"])
|
||||||
|
unique_csv.append(row)
|
||||||
|
csv = unique_csv
|
||||||
|
|
||||||
# reset index
|
return csv
|
||||||
df = df.reset_index(drop=True)
|
|
||||||
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
||||||
# read csv with pandas
|
# read csv file and load timezones and countries
|
||||||
df_file = load_tz_file()
|
csv = load_tz_file()
|
||||||
|
|
||||||
# split zone_name into components by "/"
|
# split zone_name into components by "/"
|
||||||
df_file[['region', 'city']] = df_file['zone_name'].str.split(
|
for row in csv:
|
||||||
'/', expand=True, n=1)
|
parts = row["zone_name"].split("/", 1)
|
||||||
|
row["region"] = parts[0]
|
||||||
|
row["city"] = parts[1] if len(parts) > 1 else None
|
||||||
# drop regions with no country_code (like Etc, GMT, etc)
|
# drop regions with no country_code (like Etc, GMT, etc)
|
||||||
df_file = df_file[df_file['country_code'].notna()]
|
csv = [row for row in csv if row["country_code"]]
|
||||||
|
|
||||||
|
# get all timezones from pytz and split into region and city
|
||||||
|
|
||||||
|
tz = [{"zone_name": tz} for tz in pytz.all_timezones]
|
||||||
|
|
||||||
df_tz = pd.DataFrame(pytz.all_timezones)
|
|
||||||
# rename column to zone_name
|
|
||||||
df_tz = df_tz.rename(columns={0: 'zone_name'})
|
|
||||||
# split zone_name into components by "/"
|
# split zone_name into components by "/"
|
||||||
df_tz[['region', 'city']] = df_tz['zone_name'].str.split(
|
for row in tz:
|
||||||
'/', expand=True, n=1)
|
parts = row["zone_name"].split("/", 1)
|
||||||
|
row["region"] = parts[0]
|
||||||
|
row["city"] = parts[1] if len(parts) > 1 else None
|
||||||
# drop regions with no city (like UTC, GMT, etc)
|
# drop regions with no city (like UTC, GMT, etc)
|
||||||
df_tz = df_tz[df_tz['city'].notna()]
|
tz = [row for row in tz if row["city"]]
|
||||||
# drop rows where region is 'Etc'
|
# drop rows where region is 'Etc'
|
||||||
df_tz = df_tz[df_tz['region'] != 'Etc']
|
tz = [row for row in tz if row["region"] != "Etc"]
|
||||||
|
|
||||||
|
# join data on region and city
|
||||||
|
timezones = []
|
||||||
|
for tz_row in tz:
|
||||||
|
for csv_row in csv:
|
||||||
|
if tz_row["region"] == csv_row["region"] and tz_row["city"] == csv_row["city"]:
|
||||||
|
timezones.append({
|
||||||
|
"zone_name": tz_row["zone_name"],
|
||||||
|
"country_code": csv_row["country_code"],
|
||||||
|
"region": tz_row["region"],
|
||||||
|
"city": tz_row["city"],
|
||||||
|
})
|
||||||
|
break
|
||||||
|
|
||||||
# join dataframes on region and city
|
|
||||||
df_merged = pd.merge(df_file, df_tz, on=[
|
|
||||||
'region', 'city'], how='inner', indicator=True)
|
|
||||||
# reorder columns
|
# reorder columns
|
||||||
df_merged = df_merged[['region', 'city', 'country_code']]
|
timezones = [{k: row[k] for k in ['region', 'city', 'country_code']}
|
||||||
# print merged dataframe
|
for row in timezones]
|
||||||
print(f"Merged timezones: {len(df_merged)}")
|
|
||||||
print(df_merged.sample(20).to_string(index=False))
|
# print merged data
|
||||||
regions = df_merged['region'].unique()
|
print(f"Merged timezones: {len(timezones)}")
|
||||||
|
print(timezones[:20])
|
||||||
|
regions = set(row['region'] for row in timezones)
|
||||||
for region in regions:
|
for region in regions:
|
||||||
df_region = df_merged[df_merged['region'] == region]
|
df_region = [row for row in timezones if row['region'] == region]
|
||||||
print(f"{len(df_region)} merged in {region}")
|
print(f"{len(df_region)} merged in {region}")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user