{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "#!pip3 install python-docx\n", "#!pip3 install openai\n", "#!pip3 install spacy" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os # For file path operations\n", "import re # For regular expressions (finding keywords)\n", "import requests # For making HTTP requests to fetch job description\n", "from docx import Document # From python-docx for reading/writing Word documents\n", "from docx.shared import Pt # For setting font sizes, etc.\n", "import time\n", "import datetime\n", "import pandas as pd\n", "\n", "import spacy\n", "from datetime import datetime, timedelta\n", "import undetected_chromedriver as uc\n", "from selenium import webdriver\n", "from selenium.webdriver.common.by import By\n", "from selenium.webdriver.common.keys import Keys\n", "from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.support import expected_conditions as EC" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Getting the job from linked in then put the file as input file" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "🔍 Scraping LinkedIn Jobs...\n", "\n", "⏳ Skipping job: (Posted 15 days ago)\n", "⏳ Skipping job: ******* ******* *********** ********* - *-******** (*/*/*) (Posted 115 days ago)\n", "\n", "✅ Jobs saved to /Users/eimon/Desktop/Code/AI works/Job apply AI agent/Job-apply-AI-agent/CV maker/linkedin_jobs_2025-02-27.xlsx\n" ] } ], "source": [ "def configure_driver():\n", " options = webdriver.ChromeOptions()\n", " options.add_argument(\"--headless\")\n", " options.add_argument(\"--no-sandbox\")\n", " options.add_argument(\"--disable-dev-shm-usage\")\n", " driver = uc.Chrome(options=options)\n", " return driver\n", "\n", "def scrape_linkedin_jobs(keyword, location):\n", " print(\"\\n🔍 Scraping LinkedIn Jobs...\\n\")\n", " driver = configure_driver()\n", " search_url = f\"https://www.linkedin.com/jobs/search?keywords={keyword.replace(' ', '%20')}&location={location.replace(' ', '%20')}\"\n", " driver.get(search_url)\n", " \n", " for _ in range(3): \n", " driver.execute_script(\"window.scrollBy(0, 800);\")\n", " time.sleep(2)\n", " \n", " wait = WebDriverWait(driver, 15)\n", " try:\n", " wait.until(EC.presence_of_element_located((By.CLASS_NAME, \"base-card\")))\n", " except:\n", " print(\"❌ No LinkedIn jobs found.\")\n", " driver.quit()\n", " return []\n", "\n", " jobs = []\n", " today = datetime.today()\n", " job_elements = driver.find_elements(By.CLASS_NAME, \"base-card\")\n", " \n", " for job in job_elements[:10]:\n", " try:\n", " title = job.find_element(By.CSS_SELECTOR, \"h3\").text.strip()\n", " company = job.find_element(By.CSS_SELECTOR, \"h4\").text.strip()\n", " link = job.find_element(By.TAG_NAME, \"a\").get_attribute(\"href\")\n", " \n", " try:\n", " date_element = job.find_element(By.CSS_SELECTOR, \"time\")\n", " posted_time = date_element.get_attribute(\"datetime\")\n", " if posted_time:\n", " posted_date = datetime.strptime(posted_time[:10], \"%Y-%m-%d\")\n", " days_ago = (today - posted_date).days\n", " if days_ago > 14:\n", " print(f\"⏳ Skipping job: {title} (Posted {days_ago} days ago)\")\n", " continue\n", " except:\n", " print(f\"⚠️ Could not find post time for: {title}, assuming it's recent.\")\n", " days_ago = \"Unknown\"\n", " \n", " jobs.append({\"title\": title, \"company\": company, \"link\": link, \"source\": \"LinkedIn\", \"posted_days_ago\": days_ago})\n", " except Exception as e:\n", " print(f\"⚠️ Skipping a job entry due to error: {e}\")\n", " continue\n", " \n", " driver.quit()\n", " return jobs\n", "\n", "if __name__ == \"__main__\":\n", " keyword = input(\"Enter job title (e.g., Software Engineer): \")\n", " location = input(\"Enter location (e.g., Remote, New York, Berlin): \")\n", " \n", " linkedin_jobs = scrape_linkedin_jobs(keyword, location)\n", " \n", " if linkedin_jobs:\n", " df = pd.DataFrame(linkedin_jobs)\n", " today_date = datetime.today().strftime(\"%Y-%m-%d\")\n", " filename = f\"linkedin_jobs_{today_date}.xlsx\"\n", " \n", " folder_path = \"/Users/eimon/Desktop/Code/AI works/Job apply AI agent/Job-apply-AI-agent/CV maker\"\n", " os.makedirs(folder_path, exist_ok=True) # Ensure directory exists\n", " input_file = os.path.join(folder_path, filename)\n", " \n", " df.to_excel(input_file, index=False)\n", " print(f\"\\n✅ Jobs saved to {input_file}\")\n", " else:\n", " print(\"\\n❌ No LinkedIn jobs found.\")\n", " input_file = None\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/eimon/Desktop/Code/AI works/Job apply AI agent/Job-apply-AI-agent/CV maker/linkedin_jobs_2025-02-27.xlsx\n" ] } ], "source": [ "#chekcing the input file is getting correctly\n", "print(input_file)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting the description of the job. fetch_full_job_details" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def fetch_full_job_details(job_url: str) -> tuple:\n", " \"\"\"\n", " Opens the LinkedIn job page, fetches the job title, company name, and full job description.\n", " Returns (job_title, company_name, job_description).\n", " \"\"\"\n", " options = uc.ChromeOptions()\n", " options.add_argument(\"--headless\") # or remove this if you want to see the browser\n", " options.add_argument(\"--no-sandbox\")\n", " options.add_argument(\"--disable-dev-shm-usage\")\n", "\n", " driver = uc.Chrome(options=options)\n", " driver.get(job_url)\n", "\n", " # Default empty values\n", " job_title = \"\"\n", " company_name = \"\"\n", " job_description = \"\"\n", "\n", " try:\n", " wait = WebDriverWait(driver, 15)\n", "\n", " # 1) Job Title (example selector)\n", " title_elem = wait.until(\n", " EC.presence_of_element_located((By.CSS_SELECTOR, \"h1.topcard__title\"))\n", " )\n", " job_title = title_elem.get_attribute(\"innerText\")\n", "\n", " # 2) Company Name (example selector)\n", " company_elem = wait.until(\n", " EC.presence_of_element_located((By.CSS_SELECTOR, \"a.topcard__org-name-link\"))\n", " )\n", " company_name = company_elem.get_attribute(\"innerText\")\n", "\n", " # 3) Full Job Description (often \"description__text\" class)\n", " desc_elem = wait.until(\n", " EC.presence_of_element_located((By.CLASS_NAME, \"description__text\"))\n", " )\n", " job_description = desc_elem.get_attribute(\"innerText\")\n", "\n", " except Exception as e:\n", " print(f\"Error scraping {job_url}: {e}\")\n", "\n", " finally:\n", " driver.quit()\n", "\n", " return job_title.strip(), company_name.strip(), job_description.strip()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### After modifying the excel sheet with description" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " title \\\n", "0 Working Student - Digital Analytics (all genders) \n", "1 Working Student Graphic Design (m/w/d) \n", "2 Werkstudent \n", "3 Working Student Corporate and Business Develop... \n", "4 Working Student in Product Marketing \n", "\n", " company \\\n", "0 Digitl \n", "1 Fanblast \n", "2 DDC Management Consultants \n", "3 PRIOjet GmbH \n", "4 Rabot Energy \n", "\n", " link source \\\n", "0 https://de.linkedin.com/jobs/view/working-stud... LinkedIn \n", "1 https://de.linkedin.com/jobs/view/working-stud... LinkedIn \n", "2 https://de.linkedin.com/jobs/view/werkstudent-... LinkedIn \n", "3 https://de.linkedin.com/jobs/view/working-stud... LinkedIn \n", "4 https://de.linkedin.com/jobs/view/working-stud... LinkedIn \n", "\n", " posted_days_ago description \\\n", "0 5 Digitl ist ein junges und innovatives Unterneh... \n", "1 8 About fanblast\\n\\nFanblast is a fast-growing S... \n", "2 6 DDC Management Consultants ist als Management-... \n", "3 6 Hi! We’re happy that you’re here.\\n\\n\\n\\n\\nPRI... \n", "4 13 Why us?\\n\\nRABOT Energy is looking for a motiv... \n", "\n", " Extracted Skills Extracted Requirements \n", "0 [] [] \n", "1 [] [required, ability to, proficiency in, experie... \n", "2 [] [] \n", "3 [] [required, experience in] \n", "4 [] [experience in] \n" ] } ], "source": [ "import pandas as pd\n", "import re\n", "\n", "# Load the updated Excel file\n", "input_file = \"final_job_descriptions.xlsx\"\n", "df = pd.read_excel(input_file)\n", "\n", "# Define your existing skills and categories\n", "my_skills = {\n", " \"Data Science & Machine Learning\": [\"Python\", \"R\", \"TensorFlow\", \"NumPy\", \"Pandas\", \"Seaborn\", \"Scikit-learn\"],\n", " \"Statistical Modeling & AI\": [\"ML models\", \"AI\", \"Custom-GPT\", \"Deep Learning\"],\n", " \"AI Agent\": [\"n8n\", \"Python AI Agent\", \"Automation\"],\n", " \"Business Intelligence & Dashboarding\": [\"Power BI\", \"Tableau\", \"SQL\", \"Data Visualization\"],\n", " \"Database Optimization\": [\"SQL\", \"MySQL\", \"PostgreSQL\"],\n", " \"Programming Languages\": [\"Python\", \"Java\", \"C\", \"JavaScript\"],\n", " \"Microsoft Tools\": [\"Azure\", \"Microsoft 365\", \"Dynamics 365\"]\n", "}\n", "\n", "# Common requirement phrases\n", "requirement_keywords = [\"experience in\", \"knowledge of\", \"proficiency in\", \"familiarity with\", \"required\", \"preferred\", \"must have\", \"ability to\"]\n", "\n", "def extract_skills_and_requirements(description):\n", " \"\"\"\n", " Extracts relevant skills and job requirements from the job description\n", " based on predefined skills and requirement keywords.\n", " \"\"\"\n", " description = description.lower() # Convert to lowercase for easier matching\n", "\n", " # Identify matching skills\n", " matched_skills = set()\n", " for category, skills in my_skills.items():\n", " for skill in skills:\n", " pattern = rf\"\\b{re.escape(skill.lower())}\\b\"\n", " if re.search(pattern, description):\n", " matched_skills.add(skill)\n", "\n", " # Extract job requirements based on common keywords\n", " matched_requirements = set()\n", " for keyword in requirement_keywords:\n", " if keyword in description:\n", " matched_requirements.add(keyword)\n", "\n", " return list(matched_skills), list(matched_requirements)\n", "\n", "def process_job_descriptions(df, desc_col=\"description\", title_col=\"title\"):\n", " \"\"\"\n", " Extracts skills and requirements from job descriptions and stores them in the DataFrame.\n", " \"\"\"\n", " skills_list = []\n", " requirements_list = []\n", "\n", " for idx, row in df.iterrows():\n", " description_text = str(row.get(desc_col, \"\"))\n", " job_title = str(row.get(title_col, \"No Title Provided\"))\n", "\n", " if not description_text.strip():\n", " skills_list.append([])\n", " requirements_list.append([])\n", " continue\n", " \n", " matched_skills, matched_requirements = extract_skills_and_requirements(description_text)\n", " skills_list.append(matched_skills)\n", " requirements_list.append(matched_requirements)\n", "\n", " df[\"Extracted Skills\"] = skills_list\n", " df[\"Extracted Requirements\"] = requirements_list\n", " return df\n", "\n", "# Process the job descriptions and display results\n", "df = process_job_descriptions(df)\n", "print(df)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Getting some keywords" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "CV_R", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 2 }