import pandas as pd from sqlalchemy import create_engine, text from web.utils import get_mysql_config def get_engine(): cfg = get_mysql_config() url = f"mysql+pymysql://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/{cfg['database']}?charset=utf8mb4" return create_engine(url, future=True) def get_all_jobs(): query = """ SELECT l.job_id ,l.title ,d.description ,l.region ,l.keyword ,d.company ,l.location ,l.timestamp ,d.posted_time ,l.url ,c.file_path ,c.last_modified ,c.url_guess ,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale FROM job_listings AS l INNER JOIN job_descriptions AS d ON l.job_id = d.job_id AND l.url = d.url LEFT JOIN cached_pages AS c ON l.job_id = c.job_id ORDER BY d.posted_time DESC """ engine = get_engine() with engine.begin() as conn: rows = conn.execute(text(query)).fetchall() return [ { "job_id": row[0], "title": row[1], "description": row[2], "region": row[3], "keyword": row[4], "company": row[5], "location": row[6], "timestamp": row[7], "posted_time": row[8], "url": row[9], "file_path": row[10], "last_modified": row[11], "url_guess": row[12], "url_guess_stale": row[13], } for row in rows ] def main(): """Main function to load and display job postings.""" jobs_df = pd.DataFrame(get_all_jobs()) print(jobs_df.head()) print(f"Total postings: {len(jobs_df)}") print("Regions:") print(jobs_df['region'].value_counts()) print("Keywords:") print(jobs_df['keyword'].value_counts()) # print("Sample Job Postings:") # print("-" * 40) # for sample in jobs_df[['region', 'keyword', 'title', 'location', 'description']].sample(5).itertuples(): # print( # f"Region: {sample.region}, Keyword: {sample.keyword}, Title: {sample.title}, Location: {sample.location}") # print(sample.description) # print("-" * 40) if __name__ == "__main__": main()