{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip3 install python-docx\n",
    "#!pip3 install openai\n",
    "#!pip3 install spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os       # For file path operations\n",
    "import re       # For regular expressions (finding keywords)\n",
    "import requests # For making HTTP requests to fetch job description\n",
    "from docx import Document     # From python-docx for reading/writing Word documents\n",
    "from docx.shared import Pt    # For setting font sizes, etc.\n",
    "import time\n",
    "import datetime\n",
    "import pandas as pd\n",
    "\n",
    "import spacy\n",
    "from datetime import datetime, timedelta\n",
    "import undetected_chromedriver as uc\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.common.keys import Keys\n",
    "from selenium.webdriver.support.ui import WebDriverWait\n",
    "from selenium.webdriver.support import expected_conditions as EC"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Scrape jobs from LinkedIn and save them to the input file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "🔍 Scraping LinkedIn Jobs...\n",
      "\n",
      "⏳ Skipping job:  (Posted 15 days ago)\n",
      "⏳ Skipping job: ******* ******* *********** ********* - *-******** (*/*/*) (Posted 115 days ago)\n",
      "\n",
      "✅ Jobs saved to /Users/eimon/Desktop/Code/AI works/Job apply AI agent/Job-apply-AI-agent/CV maker/linkedin_jobs_2025-02-27.xlsx\n"
     ]
    }
   ],
   "source": [
    "def configure_driver():\n",
    "    \"\"\"Start a headless Chrome session through undetected_chromedriver.\n",
    "\n",
    "    Returns:\n",
    "        The started ``uc.Chrome`` driver. It is not closed here, so the caller\n",
    "        has to call ``driver.quit()`` when done.\n",
    "    \"\"\"\n",
    "    # Use uc.ChromeOptions (the options class shipped with\n",
    "    # undetected_chromedriver) rather than selenium's webdriver.ChromeOptions,\n",
    "    # so this matches how fetch_full_job_details configures the same driver.\n",
    "    options = uc.ChromeOptions()\n",
    "    options.add_argument(\"--headless\")\n",
    "    options.add_argument(\"--no-sandbox\")\n",
    "    options.add_argument(\"--disable-dev-shm-usage\")\n",
    "    return uc.Chrome(options=options)\n",
    "\n",
    "def scrape_linkedin_jobs(keyword, location):\n",
    "    \"\"\"Scrape the LinkedIn job search page for recently posted jobs.\n",
    "\n",
    "    Args:\n",
    "        keyword: Search terms (job title) for the ``keywords`` query parameter.\n",
    "        location: Value for the ``location`` query parameter.\n",
    "\n",
    "    Returns:\n",
    "        A list of dicts with the keys \"title\", \"company\", \"link\", \"source\" and\n",
    "        \"posted_days_ago\". \"posted_days_ago\" is an int, or the string \"Unknown\"\n",
    "        when the card has no usable posting date. Only the first 10 result\n",
    "        cards are inspected and cards posted more than 14 days ago are skipped.\n",
    "        An empty list is returned when no result card appears within 15 seconds.\n",
    "    \"\"\"\n",
    "    # Local import so this cell stays self-contained. quote() percent-encodes\n",
    "    # every reserved character (not only spaces), so search terms such as\n",
    "    # \"C++\" or \"R&D\" cannot corrupt the query string.\n",
    "    from urllib.parse import quote\n",
    "\n",
    "    print(\"\\n🔍 Scraping LinkedIn Jobs...\\n\")\n",
    "    driver = configure_driver()\n",
    "    # try/finally guarantees the browser is closed on every exit path,\n",
    "    # including errors raised while loading or scrolling the page.\n",
    "    try:\n",
    "        search_url = (\n",
    "            \"https://www.linkedin.com/jobs/search\"\n",
    "            f\"?keywords={quote(keyword, safe='')}&location={quote(location, safe='')}\"\n",
    "        )\n",
    "        driver.get(search_url)\n",
    "\n",
    "        # Scroll down a few times before reading the cards\n",
    "        # (presumably to trigger lazy loading of more results).\n",
    "        for _ in range(3):\n",
    "            driver.execute_script(\"window.scrollBy(0, 800);\")\n",
    "            time.sleep(2)\n",
    "\n",
    "        wait = WebDriverWait(driver, 15)\n",
    "        try:\n",
    "            wait.until(EC.presence_of_element_located((By.CLASS_NAME, \"base-card\")))\n",
    "        except Exception:\n",
    "            print(\"❌ No LinkedIn jobs found.\")\n",
    "            return []\n",
    "\n",
    "        jobs = []\n",
    "        today = datetime.today()\n",
    "        job_elements = driver.find_elements(By.CLASS_NAME, \"base-card\")\n",
    "\n",
    "        for job in job_elements[:10]:\n",
    "            try:\n",
    "                title = job.find_element(By.CSS_SELECTOR, \"h3\").text.strip()\n",
    "                company = job.find_element(By.CSS_SELECTOR, \"h4\").text.strip()\n",
    "                link = job.find_element(By.TAG_NAME, \"a\").get_attribute(\"href\")\n",
    "\n",
    "                # Reset for every card: otherwise a card whose <time> element has\n",
    "                # an empty datetime attribute would reuse the previous card's\n",
    "                # value (or hit a NameError on the first card).\n",
    "                days_ago = \"Unknown\"\n",
    "                try:\n",
    "                    date_element = job.find_element(By.CSS_SELECTOR, \"time\")\n",
    "                    posted_time = date_element.get_attribute(\"datetime\")\n",
    "                    if posted_time:\n",
    "                        # Only the leading YYYY-MM-DD part is parsed.\n",
    "                        posted_date = datetime.strptime(posted_time[:10], \"%Y-%m-%d\")\n",
    "                        days_ago = (today - posted_date).days\n",
    "                except Exception:\n",
    "                    # Missing element or unparsable date: keep the job, age unknown.\n",
    "                    days_ago = \"Unknown\"\n",
    "\n",
    "                if days_ago == \"Unknown\":\n",
    "                    print(f\"⚠️ Could not find post time for: {title}, assuming it's recent.\")\n",
    "                elif days_ago > 14:\n",
    "                    print(f\"⏳ Skipping job: {title} (Posted {days_ago} days ago)\")\n",
    "                    continue\n",
    "\n",
    "                jobs.append({\"title\": title, \"company\": company, \"link\": link, \"source\": \"LinkedIn\", \"posted_days_ago\": days_ago})\n",
    "            except Exception as e:\n",
    "                # Best effort: one malformed card must not abort the whole scrape.\n",
    "                print(f\"⚠️ Skipping a job entry due to error: {e}\")\n",
    "                continue\n",
    "\n",
    "        return jobs\n",
    "    finally:\n",
    "        driver.quit()\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    # Ask for the search terms, scrape LinkedIn and store the result as an\n",
    "    # Excel sheet. `input_file` ends up holding the path of the written file,\n",
    "    # or None when nothing was found.\n",
    "    keyword = input(\"Enter job title (e.g., Software Engineer): \")\n",
    "    location = input(\"Enter location (e.g., Remote, New York, Berlin): \")\n",
    "\n",
    "    linkedin_jobs = scrape_linkedin_jobs(keyword, location)\n",
    "\n",
    "    if linkedin_jobs:\n",
    "        df = pd.DataFrame(linkedin_jobs)\n",
    "        today_date = datetime.today().strftime(\"%Y-%m-%d\")\n",
    "        filename = f\"linkedin_jobs_{today_date}.xlsx\"\n",
    "\n",
    "        # The output folder can be overridden with the JOB_OUTPUT_DIR environment\n",
    "        # variable so the notebook also runs on other machines; the previous\n",
    "        # hard-coded location stays the default.\n",
    "        folder_path = os.environ.get(\n",
    "            \"JOB_OUTPUT_DIR\",\n",
    "            \"/Users/eimon/Desktop/Code/AI works/Job apply AI agent/Job-apply-AI-agent/CV maker\",\n",
    "        )\n",
    "        os.makedirs(folder_path, exist_ok=True)  # Ensure directory exists\n",
    "        input_file = os.path.join(folder_path, filename)\n",
    "\n",
    "        df.to_excel(input_file, index=False)\n",
    "        print(f\"\\n✅ Jobs saved to {input_file}\")\n",
    "    else:\n",
    "        print(\"\\n❌ No LinkedIn jobs found.\")\n",
    "        input_file = None\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/eimon/Desktop/Code/AI works/Job apply AI agent/Job-apply-AI-agent/CV maker/linkedin_jobs_2025-02-27.xlsx\n"
     ]
    }
   ],
   "source": [
    "# Checking that input_file was set correctly\n",
    "print(input_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetching the full job description (`fetch_full_job_details`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetch_full_job_details(job_url: str) -> tuple:\n",
    "    \"\"\"\n",
    "    Open a LinkedIn job page and read its title, company name and description.\n",
    "\n",
    "    Args:\n",
    "        job_url: URL of the job posting to load.\n",
    "\n",
    "    Returns:\n",
    "        (job_title, company_name, job_description) as stripped strings. The\n",
    "        three fields are read in that order inside one try block, waiting up to\n",
    "        15 seconds for each element; when one of them fails, the error is\n",
    "        printed and that field and all later ones are returned as \"\".\n",
    "    \"\"\"\n",
    "    options = uc.ChromeOptions()\n",
    "    options.add_argument(\"--headless\")           # or remove this if you want to see the browser\n",
    "    options.add_argument(\"--no-sandbox\")\n",
    "    options.add_argument(\"--disable-dev-shm-usage\")\n",
    "\n",
    "    driver = uc.Chrome(options=options)\n",
    "\n",
    "    # Default empty values\n",
    "    job_title = \"\"\n",
    "    company_name = \"\"\n",
    "    job_description = \"\"\n",
    "\n",
    "    # Everything after the browser has started runs under try/finally, so the\n",
    "    # browser is closed even when driver.get() itself raises. Navigation errors\n",
    "    # still propagate to the caller as before.\n",
    "    try:\n",
    "        driver.get(job_url)\n",
    "\n",
    "        try:\n",
    "            wait = WebDriverWait(driver, 15)\n",
    "\n",
    "            # get_attribute() may return None; `or \"\"` keeps the values strings\n",
    "            # so the final .strip() calls cannot fail.\n",
    "\n",
    "            # 1) Job Title (example selector)\n",
    "            title_elem = wait.until(\n",
    "                EC.presence_of_element_located((By.CSS_SELECTOR, \"h1.topcard__title\"))\n",
    "            )\n",
    "            job_title = title_elem.get_attribute(\"innerText\") or \"\"\n",
    "\n",
    "            # 2) Company Name (example selector)\n",
    "            company_elem = wait.until(\n",
    "                EC.presence_of_element_located((By.CSS_SELECTOR, \"a.topcard__org-name-link\"))\n",
    "            )\n",
    "            company_name = company_elem.get_attribute(\"innerText\") or \"\"\n",
    "\n",
    "            # 3) Full Job Description (often \"description__text\" class)\n",
    "            desc_elem = wait.until(\n",
    "                EC.presence_of_element_located((By.CLASS_NAME, \"description__text\"))\n",
    "            )\n",
    "            job_description = desc_elem.get_attribute(\"innerText\") or \"\"\n",
    "\n",
    "        except Exception as e:\n",
    "            # Best effort: report the problem and return what was collected so far.\n",
    "            print(f\"Error scraping {job_url}: {e}\")\n",
    "\n",
    "    finally:\n",
    "        driver.quit()\n",
    "\n",
    "    return job_title.strip(), company_name.strip(), job_description.strip()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Extracting skills and requirements once the Excel sheet contains the job descriptions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                               title  \\\n",
      "0  Working Student - Digital Analytics (all genders)   \n",
      "1             Working Student Graphic Design (m/w/d)   \n",
      "2                                        Werkstudent   \n",
      "3  Working Student Corporate and Business Develop...   \n",
      "4               Working Student in Product Marketing   \n",
      "\n",
      "                      company  \\\n",
      "0                      Digitl   \n",
      "1                    Fanblast   \n",
      "2  DDC Management Consultants   \n",
      "3                PRIOjet GmbH   \n",
      "4                Rabot Energy   \n",
      "\n",
      "                                                link    source  \\\n",
      "0  https://de.linkedin.com/jobs/view/working-stud...  LinkedIn   \n",
      "1  https://de.linkedin.com/jobs/view/working-stud...  LinkedIn   \n",
      "2  https://de.linkedin.com/jobs/view/werkstudent-...  LinkedIn   \n",
      "3  https://de.linkedin.com/jobs/view/working-stud...  LinkedIn   \n",
      "4  https://de.linkedin.com/jobs/view/working-stud...  LinkedIn   \n",
      "\n",
      "   posted_days_ago                                        description  \\\n",
      "0                5  Digitl ist ein junges und innovatives Unterneh...   \n",
      "1                8  About fanblast\\n\\nFanblast is a fast-growing S...   \n",
      "2                6  DDC Management Consultants ist als Management-...   \n",
      "3                6  Hi! We’re happy that you’re here.\\n\\n\\n\\n\\nPRI...   \n",
      "4               13  Why us?\\n\\nRABOT Energy is looking for a motiv...   \n",
      "\n",
      "  Extracted Skills                             Extracted Requirements  \n",
      "0               []                                                 []  \n",
      "1               []  [required, ability to, proficiency in, experie...  \n",
      "2               []                                                 []  \n",
      "3               []                          [required, experience in]  \n",
      "4               []                                    [experience in]  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "# Load the updated Excel file.\n",
    "# The path is relative, so it is resolved against the kernel's working directory.\n",
    "# NOTE: this rebinds `input_file`, which the scraping cell earlier in the\n",
    "# notebook also assigns.\n",
    "input_file = \"final_job_descriptions.xlsx\"\n",
    "df = pd.read_excel(input_file)\n",
    "\n",
    "# Define your existing skills and categories.\n",
    "# Only the skill names are used for matching (case-insensitive, on word\n",
    "# boundaries); the category keys are not part of the extraction result.\n",
    "# A skill listed under several categories (e.g. SQL, Python) is reported once.\n",
    "my_skills = {\n",
    "    \"Data Science & Machine Learning\": [\"Python\", \"R\", \"TensorFlow\", \"NumPy\", \"Pandas\", \"Seaborn\", \"Scikit-learn\"],\n",
    "    \"Statistical Modeling & AI\": [\"ML models\", \"AI\", \"Custom-GPT\", \"Deep Learning\"],\n",
    "    \"AI Agent\": [\"n8n\", \"Python AI Agent\", \"Automation\"],\n",
    "    \"Business Intelligence & Dashboarding\": [\"Power BI\", \"Tableau\", \"SQL\", \"Data Visualization\"],\n",
    "    \"Database Optimization\": [\"SQL\", \"MySQL\", \"PostgreSQL\"],\n",
    "    \"Programming Languages\": [\"Python\", \"Java\", \"C\", \"JavaScript\"],\n",
    "    \"Microsoft Tools\": [\"Azure\", \"Microsoft 365\", \"Dynamics 365\"]\n",
    "}\n",
    "\n",
    "# Common requirement phrases.\n",
    "# They are matched as plain substrings of the lowercased description, so each\n",
    "# phrase has to be written in lowercase here.\n",
    "requirement_keywords = [\"experience in\", \"knowledge of\", \"proficiency in\", \"familiarity with\", \"required\", \"preferred\", \"must have\", \"ability to\"]\n",
    "\n",
    "def extract_skills_and_requirements(description):\n",
    "    \"\"\"\n",
    "    Find which predefined skills and requirement phrases occur in a description.\n",
    "\n",
    "    Args:\n",
    "        description: Job description text. Any non-string value (for example\n",
    "            None or NaN from an empty cell) is treated as having no matches.\n",
    "\n",
    "    Returns:\n",
    "        (matched_skills, matched_requirements): two lists without duplicates.\n",
    "        Skills keep the spelling and order in which they first appear in\n",
    "        ``my_skills``; requirements follow the order of ``requirement_keywords``.\n",
    "        The order is therefore the same on every run.\n",
    "    \"\"\"\n",
    "    if not isinstance(description, str):\n",
    "        return [], []\n",
    "\n",
    "    text = description.lower()  # Lowercase once for case-insensitive matching\n",
    "\n",
    "    # Identify matching skills. A dict is used as an insertion-ordered set so a\n",
    "    # skill listed under several categories is tested and reported only once.\n",
    "    matched_skills = {}\n",
    "    for skills in my_skills.values():\n",
    "        for skill in skills:\n",
    "            if skill in matched_skills:\n",
    "                continue\n",
    "            # \\b on both sides: match the skill as a whole word or phrase only.\n",
    "            pattern = rf\"\\b{re.escape(skill.lower())}\\b\"\n",
    "            if re.search(pattern, text):\n",
    "                matched_skills[skill] = True\n",
    "\n",
    "    # Requirement phrases are plain substring matches; dict.fromkeys drops\n",
    "    # repeated phrases while preserving their order.\n",
    "    matched_requirements = dict.fromkeys(\n",
    "        keyword for keyword in requirement_keywords if keyword in text\n",
    "    )\n",
    "\n",
    "    return list(matched_skills), list(matched_requirements)\n",
    "\n",
    "def process_job_descriptions(df, desc_col=\"description\", title_col=\"title\"):\n",
    "    \"\"\"\n",
    "    Add \"Extracted Skills\" and \"Extracted Requirements\" columns to the DataFrame.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with one job per row. It is modified in place.\n",
    "        desc_col: Name of the column holding the description text. Rows where\n",
    "            it is absent, missing (NaN/None) or blank get two empty lists.\n",
    "        title_col: Accepted for backward compatibility; the title is not needed\n",
    "            for the extraction and is not read.\n",
    "\n",
    "    Returns:\n",
    "        The same ``df`` object, with the two new columns holding one list per row.\n",
    "    \"\"\"\n",
    "    skills_list = []\n",
    "    requirements_list = []\n",
    "\n",
    "    for _, row in df.iterrows():\n",
    "        raw_description = row.get(desc_col, \"\")\n",
    "        # Empty Excel cells are read as NaN; str(NaN) would turn them into the\n",
    "        # literal text \"nan\", so map missing values to an empty string instead.\n",
    "        if pd.api.types.is_scalar(raw_description) and pd.isna(raw_description):\n",
    "            description_text = \"\"\n",
    "        else:\n",
    "            description_text = str(raw_description)\n",
    "\n",
    "        if not description_text.strip():\n",
    "            skills_list.append([])\n",
    "            requirements_list.append([])\n",
    "            continue\n",
    "\n",
    "        matched_skills, matched_requirements = extract_skills_and_requirements(description_text)\n",
    "        skills_list.append(matched_skills)\n",
    "        requirements_list.append(matched_requirements)\n",
    "\n",
    "    df[\"Extracted Skills\"] = skills_list\n",
    "    df[\"Extracted Requirements\"] = requirements_list\n",
    "    return df\n",
    "\n",
    "# Process the job descriptions and display results.\n",
    "# process_job_descriptions adds the \"Extracted Skills\" and \"Extracted Requirements\"\n",
    "# columns to df and returns that same object; print(df) then writes the frame\n",
    "# as plain text.\n",
    "df = process_job_descriptions(df)\n",
    "print(df)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Getting some keywords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "CV_R",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}