#!/usr/bin/env python3
"""
Scraper ultra-stealth pour extraire les détails des offres d'emploi depuis jobs.ch
Version avec gestion avancée des captchas et techniques d'évitement
"""

import requests
from bs4 import BeautifulSoup
import json
import csv
import time
import os
import re
import logging
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass
import random
from datetime import datetime
from fake_useragent import UserAgent
import urllib3
from selenium import webdriver
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc  # the PyPI package is undetected-chromedriver

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('jobs_ch_stealth_scraper.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

@dataclass
class StealthScraperConfig:
    """Stealth scraper configuration"""
    input_file: str
    output_csv: str
    images_dir: str
    max_retries: int = 2
    delay_range: tuple = (15, 30)  # Very long delays (seconds)
    session_reset_interval: int = 2  # Very frequent browser resets
    timeout: int = 60
    connect_timeout: int = 30
    read_timeout: int = 60
    progress_file: str = "scraping_progress.json"
    use_selenium: bool = True  # Use Selenium to get around captchas
    headless: bool = True  # Headless mode by default
    captcha_service_key: str = ""  # API key for the captcha-solving service
    max_captcha_attempts: int = 3
    enable_stealth_mode: bool = True
    user_data_dir: str = "/tmp/chrome_user_data"

class CaptchaSolver:
    """Gestionnaire de résolution de captchas"""
    
    def __init__(self, api_key: str = ""):
        self.api_key = api_key
        self.session = requests.Session()
    
    def solve_recaptcha(self, site_key: str, page_url: str) -> Optional[str]:
        """Résout un reCAPTCHA en utilisant 2captcha (nécessite une clé API)"""
        if not self.api_key:
            logger.warning("⚠️ Aucune clé API fournie pour la résolution de captcha")
            return None
        
        try:
            # Submit the captcha
            submit_data = {
                'method': 'userrecaptcha',
                'googlekey': site_key,
                'pageurl': page_url,
                'key': self.api_key,
                'json': 1
            }
            
            response = self.session.post('https://2captcha.com/in.php', data=submit_data)
            result = response.json()
            
            if result['status'] != 1:
                logger.error(f"❌ Erreur soumission captcha: {result}")
                return None
            
            captcha_id = result['request']
            logger.info(f"🧩 Captcha soumis, ID: {captcha_id}")
            
            # Poll for the solution
            for attempt in range(30):  # Max 5 minutes
                time.sleep(10)
                
                check_data = {
                    'key': self.api_key,
                    'action': 'get',
                    'id': captcha_id,
                    'json': 1
                }
                
                response = self.session.get('https://2captcha.com/res.php', params=check_data)
                result = response.json()
                
                if result['status'] == 1:
                    logger.info("✅ Captcha solved successfully")
                    return result['request']
                elif result.get('request') != 'CAPCHA_NOT_READY':
                    # With json=1, 2captcha reports a pending captcha as
                    # request == 'CAPCHA_NOT_READY'; anything else is an error
                    logger.error(f"❌ Captcha solving error: {result}")
                    return None
                
                logger.info(f"⏳ Captcha still being solved... {attempt + 1}/30")
            
            logger.error("❌ Captcha solving timed out")
            return None
            
        except Exception as e:
            logger.error(f"❌ Erreur lors de la résolution de captcha: {e}")
            return None
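
# Standalone usage sketch (requires a real 2captcha account; the site key and
# URL below are hypothetical placeholders):
#   solver = CaptchaSolver(api_key="YOUR_2CAPTCHA_KEY")
#   token = solver.solve_recaptcha("6Lc-example-site-key", "https://example.com/")
#   if token:
#       ...  # inject the token into the page (see StealthBrowser.handle_captcha)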

class StealthBrowser:
    """Navigateur stealth avec gestion avancée des captchas"""
    
    def __init__(self, config: StealthScraperConfig):
        self.config = config
        self.driver = None
        self.captcha_solver = CaptchaSolver(config.captcha_service_key)
        self.ua = UserAgent()
        
    def create_driver(self) -> webdriver.Chrome:
        """Crée un driver Chrome ultra-stealth"""
        try:
            # Utiliser undetected-chrome pour éviter la détection
            options = uc.ChromeOptions()
            
            # Configuration stealth avancée
            stealth_args = [
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
                '--disable-extensions',
                '--disable-plugins',
                '--disable-images',  # Faster and less suspicious
                '--disable-javascript',  # Disable JS unless needed (note: Cloudflare challenges need JS to auto-resolve)
                '--user-agent=' + self.ua.random,
                '--lang=fr-FR,fr,en',
                '--disable-web-security',
                '--disable-features=VizDisplayCompositor',
                '--disable-background-timer-throttling',
                '--disable-backgrounding-occluded-windows',
                '--disable-renderer-backgrounding',
                '--disable-background-networking',
                '--disable-sync',
                '--disable-translate',
                '--disable-ipc-flooding-protection',
                '--disable-hang-monitor',
                '--disable-client-side-phishing-detection',
                '--disable-component-update',
                '--disable-default-apps',
                '--disable-domain-reliability',
                '--disable-background-downloads',
                '--disable-add-to-shelf',
                '--disable-breakpad',
                '--memory-pressure-off',
                '--max_old_space_size=4096',
                f'--user-data-dir={self.config.user_data_dir}'
            ]
            
            if self.config.headless:
                stealth_args.append('--headless=new')
            
            for arg in stealth_args:
                options.add_argument(arg)
            
            # Advanced preferences
            prefs = {
                "profile.default_content_setting_values": {
                    "images": 2,  # Block images
                    "plugins": 2,
                    "popups": 2,
                    "geolocation": 2,
                    "notifications": 2,
                    "media_stream": 2,
                },
                "profile.managed_default_content_settings": {
                    "images": 2
                }
            }
            options.add_experimental_option("prefs", prefs)
            options.add_experimental_option("excludeSwitches", ["enable-automation"])
            options.add_experimental_option('useAutomationExtension', False)
            
            # Create the driver with undetected-chromedriver
            driver = uc.Chrome(options=options, version_main=None)
            
            # Anti-detection scripts
            driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            driver.execute_cdp_cmd('Network.setUserAgentOverride', {
                "userAgent": driver.execute_script("return navigator.userAgent").replace("Headless", "")
            })
            
            logger.info("✅ Driver Chrome stealth créé")
            return driver
            
        except Exception as e:
            logger.error(f"❌ Erreur création driver: {e}")
            raise
    
    def detect_captcha(self, driver: webdriver.Chrome) -> Dict[str, Any]:
        """Détecte la présence de captchas sur la page"""
        captcha_info = {
            'has_captcha': False,
            'captcha_type': None,
            'site_key': None,
            'element': None
        }
        
        try:
            # Look for reCAPTCHA
            recaptcha_elements = driver.find_elements(By.CSS_SELECTOR, 
                'div[class*="recaptcha"], iframe[src*="recaptcha"], div[data-sitekey]')
            
            if recaptcha_elements:
                captcha_info['has_captcha'] = True
                captcha_info['captcha_type'] = 'recaptcha'
                captcha_info['element'] = recaptcha_elements[0]
                
                # Extract the site key
                for element in recaptcha_elements:
                    site_key = element.get_attribute('data-sitekey')
                    if site_key:
                        captcha_info['site_key'] = site_key
                        break
                
                logger.warning("🧩 reCAPTCHA détecté")
                return captcha_info
            
            # Look for hCaptcha
            hcaptcha_elements = driver.find_elements(By.CSS_SELECTOR, 
                'div[class*="hcaptcha"], iframe[src*="hcaptcha"]')
            
            if hcaptcha_elements:
                captcha_info['has_captcha'] = True
                captcha_info['captcha_type'] = 'hcaptcha'
                captcha_info['element'] = hcaptcha_elements[0]
                logger.warning("🧩 hCaptcha détecté")
                return captcha_info
            
            # Look for a Cloudflare challenge
            cf_elements = driver.find_elements(By.CSS_SELECTOR, 
                'div[class*="cf-"], div[id*="challenge"], div[class*="challenge"]')
            
            if cf_elements or "cloudflare" in driver.page_source.lower():
                captcha_info['has_captcha'] = True
                captcha_info['captcha_type'] = 'cloudflare'
                captcha_info['element'] = cf_elements[0] if cf_elements else None
                logger.warning("☁️ Défi CloudFlare détecté")
                return captcha_info
            
            # Look for other kinds of blocking ('accès refusé' catches French pages)
            block_indicators = [
                'access denied', 'accès refusé', 'blocked', 'bot', 
                'verify you are human', 'please wait', 'checking your browser'
            ]
            
            page_text = driver.page_source.lower()
            for indicator in block_indicators:
                if indicator in page_text:
                    captcha_info['has_captcha'] = True
                    captcha_info['captcha_type'] = 'generic_block'
                    logger.warning(f"🚫 Blocage détecté: {indicator}")
                    return captcha_info
            
        except Exception as e:
            logger.debug(f"Erreur détection captcha: {e}")
        
        return captcha_info
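
    # Illustrative return value when a reCAPTCHA is found (site key truncated):
    #   {'has_captcha': True, 'captcha_type': 'recaptcha',
    #    'site_key': '6Lc...', 'element': <WebElement>}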
    
    def handle_captcha(self, driver: webdriver.Chrome, captcha_info: Dict[str, Any], url: str) -> bool:
        """Gère la résolution de captcha"""
        
        if not captcha_info['has_captcha']:
            return True
        
        captcha_type = captcha_info['captcha_type']
        
        if captcha_type == 'cloudflare':
            # Wait for Cloudflare to resolve on its own
            logger.info("⏳ Waiting for Cloudflare to resolve automatically...")
            max_wait = 30
            start_time = time.time()
            
            while time.time() - start_time < max_wait:
                try:
                    # Check whether we are still on a challenge page
                    if "cloudflare" not in driver.page_source.lower():
                        logger.info("✅ Cloudflare resolved automatically")
                        return True
                    time.sleep(2)
                except Exception:
                    break
            
            logger.warning("⚠️ Cloudflare did not resolve automatically")
            return False
        
        elif captcha_type == 'recaptcha' and self.captcha_solver.api_key:
            # Use the solving service
            site_key = captcha_info['site_key']
            if site_key:
                solution = self.captcha_solver.solve_recaptcha(site_key, url)
                if solution:
                    try:
                        # Inject the solution
                        driver.execute_script(f"document.getElementById('g-recaptcha-response').innerHTML='{solution}';")
                        
                        # Click submit if present
                        submit_buttons = driver.find_elements(By.CSS_SELECTOR, 
                            'input[type="submit"], button[type="submit"], button.submit')
                        if submit_buttons:
                            submit_buttons[0].click()
                            time.sleep(3)
                            return True
                    except Exception as e:
                        logger.error(f"❌ Erreur injection solution captcha: {e}")
        
        # Generic evasion strategies
        logger.info("🕐 Long wait to get around the block...")
        time.sleep(random.uniform(60, 120))
        
        # Try refreshing the page
        try:
            driver.refresh()
            time.sleep(10)
            
            # Check whether the captcha is gone
            new_captcha_info = self.detect_captcha(driver)
            if not new_captcha_info['has_captcha']:
                logger.info("✅ Blocage contourné après rafraîchissement")
                return True
        except Exception:
            pass
        
        return False
    
    def navigate_like_human(self, driver: webdriver.Chrome, url: str) -> bool:
        """Navigation qui simule un comportement humain"""
        try:
            # Visit the home page first
            if random.random() < 0.3:  # 30% of the time
                logger.info("🏠 Visiting the home page first...")
                driver.get("https://www.jobs.ch")
                time.sleep(random.uniform(3, 8))
                
                # Simulate a few actions
                try:
                    # Scroll a bit
                    driver.execute_script("window.scrollTo(0, 300);")
                    time.sleep(random.uniform(1, 3))
                    driver.execute_script("window.scrollTo(0, 0);")
                    time.sleep(random.uniform(1, 2))
                except Exception:
                    pass
            
            # Navigate to the target URL
            logger.info(f"🎯 Navigating to: {url}")
            driver.get(url)
            
            # Wait for the page to load
            time.sleep(random.uniform(3, 8))
            
            # Check for captchas
            captcha_info = self.detect_captcha(driver)
            if captcha_info['has_captcha']:
                logger.warning("🧩 Captcha détecté lors de la navigation")
                if not self.handle_captcha(driver, captcha_info, url):
                    return False
            
            # Simulate reading (scrolling)
            try:
                total_height = driver.execute_script("return document.body.scrollHeight")
                viewport_height = driver.execute_script("return window.innerHeight")
                
                if total_height > viewport_height:
                    # Progressive scroll
                    scroll_steps = random.randint(2, 5)
                    for i in range(scroll_steps):
                        scroll_position = (total_height / scroll_steps) * (i + 1)
                        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
                        time.sleep(random.uniform(1, 3))
            except Exception:
                pass
            
            return True
            
        except Exception as e:
            logger.error(f"❌ Erreur navigation: {e}")
            return False
    
    def get_page_source(self, url: str, max_attempts: int = 3) -> Optional[str]:
        """Récupère le code source d'une page avec gestion des captchas"""
        
        for attempt in range(max_attempts):
            try:
                # Create a new driver if needed
                if not self.driver:
                    self.driver = self.create_driver()
                
                logger.info(f"🌐 Tentative {attempt + 1} pour: {url}")
                
                # Human-like navigation
                if not self.navigate_like_human(self.driver, url):
                    logger.warning(f"⚠️ Navigation failed on attempt {attempt + 1}")
                    continue
                
                # Final captcha check
                captcha_info = self.detect_captcha(self.driver)
                if captcha_info['has_captcha']:
                    logger.warning(f"🧩 Captcha persistant, tentative {attempt + 1}")
                    if not self.handle_captcha(self.driver, captcha_info, url):
                        if attempt < max_attempts - 1:
                            # Close and recreate the driver
                            self.close_driver()
                            time.sleep(random.uniform(30, 60))
                            continue
                        else:
                            return None
                
                # Verify the page actually loaded
                page_source = self.driver.page_source
                if len(page_source) < 5000:  # Page too short, probably a problem
                    logger.warning(f"⚠️ Suspiciously short page: {len(page_source)} characters")
                    if attempt < max_attempts - 1:
                        continue
                
                logger.info(f"✅ Page récupérée avec succès: {len(page_source)} caractères")
                return page_source
                
            except Exception as e:
                logger.error(f"❌ Erreur tentative {attempt + 1}: {e}")
                if attempt < max_attempts - 1:
                    self.close_driver()
                    time.sleep(random.uniform(30, 60))
        
        return None
    
    def close_driver(self):
        """Ferme le driver proprement"""
        if self.driver:
            try:
                self.driver.quit()
            except Exception:
                pass
            self.driver = None
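
# Standalone usage sketch for StealthBrowser (illustrative; assumes Chrome is
# installed and the config paths are writable):
#   browser = StealthBrowser(StealthScraperConfig(
#       input_file="urls.txt", output_csv="out.csv", images_dir="imgs"))
#   html = browser.get_page_source("https://www.jobs.ch/en/")
#   browser.close_driver()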

class JobsChStealthScraper:
    """Scraper stealth principal"""
    
    def __init__(self, config: StealthScraperConfig):
        self.config = config
        self.browser = StealthBrowser(config)
        self.processed_urls = set()
        self.failed_urls = []
        self.scraped_count = 0
        self.setup_directories()
        self.load_progress()
        
    def setup_directories(self):
        """Crée les dossiers nécessaires"""
        Path(self.config.images_dir).mkdir(parents=True, exist_ok=True)
        Path(self.config.output_csv).parent.mkdir(parents=True, exist_ok=True)
        Path(self.config.user_data_dir).mkdir(parents=True, exist_ok=True)
        
    def save_progress(self):
        """Sauvegarde le progrès actuel"""
        progress_data = {
            'processed_urls': list(self.processed_urls),
            'failed_urls': self.failed_urls,
            'scraped_count': self.scraped_count,
            'timestamp': datetime.now().isoformat()
        }
        
        try:
            with open(self.config.progress_file, 'w', encoding='utf-8') as f:
                json.dump(progress_data, f, indent=2, ensure_ascii=False)
            logger.info(f"💾 Progrès sauvegardé: {self.scraped_count} offres traitées")
        except Exception as e:
            logger.error(f"❌ Erreur sauvegarde: {e}")

    def load_progress(self):
        """Charge le progrès précédent"""
        if not os.path.exists(self.config.progress_file):
            return
        
        try:
            with open(self.config.progress_file, 'r', encoding='utf-8') as f:
                progress_data = json.load(f)
            
            self.processed_urls = set(progress_data.get('processed_urls', []))
            self.failed_urls = progress_data.get('failed_urls', [])
            self.scraped_count = progress_data.get('scraped_count', 0)
            
            logger.info(f"📋 Progrès chargé: {len(self.processed_urls)} URLs traitées")
        except Exception as e:
            logger.error(f"❌ Erreur chargement: {e}")
    
    def clean_html_preserve_lines(self, html_content: str) -> str:
        """Nettoie le HTML en préservant les retours à la ligne"""
        if not html_content:
            return ""
        
        soup = BeautifulSoup(html_content, "html.parser")
        
        block_tags = ['p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 
                     'li', 'ul', 'ol', 'blockquote', 'pre', 'hr']
        
        for tag in soup.find_all(block_tags):
            if tag.name == 'br':
                tag.replace_with('\n')
            elif tag.name in ['li']:
                tag.insert(0, '• ')
                tag.append('\n')
            elif tag.name in ['ul', 'ol']:
                tag.append('\n')
            else:
                if not tag.get_text(strip=True):
                    tag.replace_with('\n')
                else:
                    tag.append('\n')
        
        text = soup.get_text(separator='', strip=False)
        lines = text.split('\n')
        cleaned_lines = []
        
        for line in lines:
            line = line.strip()
            if line:
                cleaned_lines.append(line)
            elif cleaned_lines and cleaned_lines[-1]:
                cleaned_lines.append('')
        
        result = '\n'.join(cleaned_lines)
        result = re.sub(r'\n{3,}', '\n\n', result)
        
        return result.strip()
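
    # Illustrative example (bullet prefix added by the 'li' branch above):
    #   clean_html_preserve_lines("<p>Tasks</p><ul><li>Python</li></ul>")
    #   -> "Tasks\n• Python"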
    
    def extract_job_id_from_url(self, url: str) -> str:
        """Extrait l'ID du job depuis l'URL"""
        match = re.search(r'/detail/([^/]+)/', url)
        return match.group(1) if match else f"job_{int(time.time())}"
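
    # Example (hypothetical URL shape matching the regex above):
    #   extract_job_id_from_url("https://www.jobs.ch/en/vacancies/detail/12345678/")
    #   -> "12345678"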
    
    def extract_job_data(self, html_content: str, url: str) -> Optional[Dict[str, Any]]:
        """Extrait les données d'emploi depuis le HTML"""
        
        soup = BeautifulSoup(html_content, "html.parser")
        job_id = self.extract_job_id_from_url(url)
        
        # Look for JSON-LD data
        scripts = soup.find_all("script", type="application/ld+json")
        
        for script in scripts:
            try:
                data_json = json.loads(script.string)
                items = data_json if isinstance(data_json, list) else [data_json]
                
                for item in items:
                    if item.get("@type") != "JobPosting":
                        continue
                    
                    title = item.get("title", "").strip()
                    description_html = item.get("description", "")
                    description_text = self.clean_html_preserve_lines(description_html)
                    
                    employer_info = item.get("hiringOrganization", {})
                    employer_name = employer_info.get("name", "").strip()
                    employer_website = employer_info.get("sameAs", "")
                    
                    job_location = item.get("jobLocation", {})
                    address_info = job_location.get("address", {})
                    
                    employment_type = item.get("employmentType", [])
                    if isinstance(employment_type, list):
                        employment_type = ', '.join(employment_type)
                    
                    job_data = {
                        "job_id": job_id,
                        "url": url,
                        "scraped_date": datetime.now().isoformat(),
                        "title": title,
                        "description": description_text,
                        "description_html": description_html,
                        "date_posted": item.get("datePosted", ""),
                        "employer_name": employer_name,
                        "employer_website": employer_website,
                        "street_address": address_info.get("streetAddress", ""),
                        "city": address_info.get("addressLocality", ""),
                        "postal_code": address_info.get("postalCode", ""),
                        "country": address_info.get("addressCountry", ""),
                        "industry": item.get("industry", ""),
                        "employment_type": employment_type,
                        "occupational_category": item.get("occupationalCategory", ""),
                        "valid_through": item.get("validThrough", ""),
                        "job_benefits": item.get("jobBenefits", ""),
                        "education_requirements": item.get("educationRequirements", ""),
                        "experience_requirements": item.get("experienceRequirements", ""),
                        "skills": item.get("skills", ""),
                        "qualifications": item.get("qualifications", "")
                    }
                    
                    logger.info(f"✅ Données extraites - {title} @ {employer_name}")
                    return job_data
                    
            except (json.JSONDecodeError, TypeError, KeyError, AttributeError):
                # TypeError covers script.string being None
                continue
        
        logger.warning(f"⚠️ Aucune donnée JobPosting trouvée pour : {url}")
        return None
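
    # The parser above assumes the detail page embeds schema.org JobPosting
    # JSON-LD. A minimal illustrative block (not copied from jobs.ch):
    #   <script type="application/ld+json">
    #   {"@type": "JobPosting", "title": "Python Developer",
    #    "hiringOrganization": {"name": "Acme AG"},
    #    "jobLocation": {"address": {"addressLocality": "Zurich"}}}
    #   </script>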
    
    def scrape_url(self, url: str, index: int) -> Optional[Dict[str, Any]]:
        """Scrape une URL avec le navigateur stealth"""
        
        try:
            logger.info(f"🎯 [{index}] Scraping: {url}")
            
            # Fetch the page source, handling captchas
            html_content = self.browser.get_page_source(url)
            
            if not html_content:
                logger.error(f"❌ Impossible de récupérer le contenu: {url}")
                self.failed_urls.append(url)
                return None
            
            # Extract the data
            job_data = self.extract_job_data(html_content, url)
            
            if job_data:
                return job_data
            else:
                logger.warning(f"⚠️ Aucune donnée extraite: {url}")
                return None
                
        except Exception as e:
            logger.error(f"❌ Erreur scraping {url}: {e}")
            self.failed_urls.append(url)
            return None
    
    def load_urls(self) -> List[str]:
        """Charge les URLs depuis le fichier d'entrée"""
        try:
            with open(self.config.input_file, 'r', encoding='utf-8') as f:
                urls = [line.strip() for line in f if line.strip() and line.strip().startswith('http')]
            logger.info(f"📋 {len(urls)} URLs chargées")
            return urls
        except Exception as e:
            logger.error(f"❌ Impossible de charger les URLs : {e}")
            return []
    
    def run(self):
        """Exécute le scraping principal"""
        urls = self.load_urls()
        if not urls:
            logger.error("Aucune URL à traiter")
            return
        
        # Filter out already-processed URLs
        urls_to_process = [url for url in urls if url not in self.processed_urls]
        logger.info(f"📊 {len(urls_to_process)} new URLs to process")
        
        if not urls_to_process:
            logger.info("✅ Toutes les URLs ont déjà été traitées")
            return
        
        # Shuffle to avoid predictable patterns
        random.shuffle(urls_to_process)
        
        fieldnames = [
            "job_id", "url", "scraped_date", "title", "description", "description_html",
            "date_posted", "employer_name", "employer_website", "street_address", 
            "city", "postal_code", "country", "industry", "employment_type", 
            "occupational_category", "valid_through", "job_benefits",
            "education_requirements", "experience_requirements", "skills", "qualifications"
        ]
        
        file_exists = os.path.exists(self.config.output_csv)
        mode = 'a' if file_exists else 'w'
        
        with open(self.config.output_csv, mode=mode, newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
            
            if not file_exists:
                writer.writeheader()
            
            success_count = 0
            # Snapshot the offset now; processed_urls grows during the loop
            start_offset = len(self.processed_urls)
            
            for i, url in enumerate(urls_to_process):
                # Reset the browser frequently to avoid detection
                if i > 0 and i % self.config.session_reset_interval == 0:
                    logger.info(f"🔄 Browser reset (URL {i})")
                    self.browser.close_driver()
                    time.sleep(random.uniform(30, 60))  # Longer pause
                
                # Scrape the URL
                job_data = self.scrape_url(url, i + start_offset)
                
                if job_data:
                    writer.writerow(job_data)
                    csvfile.flush()
                    success_count += 1
                    self.scraped_count += 1
                    self.processed_urls.add(url)
                    
                    if success_count % 25 == 0:  # Save frequently
                        self.save_progress()
                
                # Very long delay between requests
                delay = random.uniform(*self.config.delay_range)
                logger.info(f"😴 Sleeping: {delay:.1f}s")
                time.sleep(delay)
        
        # Final cleanup
        self.browser.close_driver()
        self.save_progress()
        
        logger.info(f"🎉 Scraping terminé !")
        logger.info(f"✅ Succès : {success_count}/{len(urls_to_process)}")
        logger.info(f"📊 Total traité : {self.scraped_count}")
        if self.failed_urls:
            logger.info(f"❌ Échecs : {len(self.failed_urls)}")

def main():
    """Fonction principale"""
    
    config = StealthScraperConfig(
        input_file="/root/liens/1jobs_ch_complete_extraction.csv",
        output_csv="/root/liens/jobs_ch_details.csv",
        images_dir="/root/liens/jobs_ch_images",
        max_retries=2,
        delay_range=(20, 45),  # Very long delays
        session_reset_interval=2,  # Very frequent resets
        timeout=60,
        use_selenium=True,
        headless=True,  # Set to False to watch the browser
        captcha_service_key="",  # Add your 2captcha key if you have one
        enable_stealth_mode=True
    )
    
    print("="*80)
    print("SCRAPER ULTRA-STEALTH JOBS.CH - GESTION CAPTCHAS")
    print("="*80)
    print(f"Mode Selenium: {config.use_selenium}")
    print(f"Mode headless: {config.headless}")
    print(f"Délai entre requêtes: {config.delay_range[0]}-{config.delay_range[1]}s")
    print(f"Reset navigateur: tous les {config.session_reset_interval} URLs")
    print(f"Service captcha: {'Activé' if config.captcha_service_key else 'Non configuré'}")
    print("="*80)
    
    if not os.path.exists(config.input_file):
        print(f"❌ Fichier d'entrée introuvable: {config.input_file}")
        return
    
    scraper = JobsChStealthScraper(config)
    
    try:
        scraper.run()
    except KeyboardInterrupt:
        logger.info("🛑 Arrêt demandé par l'utilisateur")
        scraper.save_progress()
    except Exception as e:
        logger.error(f"❌ Erreur fatale : {e}")
        scraper.save_progress()
        import traceback
        traceback.print_exc()
    finally:
        scraper.browser.close_driver()

if __name__ == "__main__":
    main()