# ComfyUI-Manager/scanner.py

import ast
import re
import os
import json
from git import Repo
import concurrent.futures
import datetime
import requests
import warnings
import argparse
import sys
from urllib.parse import urlparse
from github import Github, Auth
from pathlib import Path
from typing import Set, Dict, Optional

# Registry of built-in ComfyUI node names, populated when scanning ComfyUI itself
builtin_nodes = set()
# Scanner version for cache invalidation
SCANNER_VERSION = "2.0.11" # Multi-layer detection: class existence + display names
# Cache for extract_nodes and extract_nodes_enhanced results
_extract_nodes_cache: Dict[str, Set[str]] = {}
_extract_nodes_enhanced_cache: Dict[str, Set[str]] = {}
_file_mtime_cache: Dict[Path, float] = {}
def _get_repo_root(file_path: Path) -> Optional[Path]:
"""Find the repository root directory containing .git"""
current = file_path if file_path.is_dir() else file_path.parent
while current != current.parent:
if (current / ".git").exists():
return current
current = current.parent
return None
def _get_repo_hash(repo_path: Path) -> str:
"""Get git commit hash or fallback identifier"""
git_dir = repo_path / ".git"
if not git_dir.exists():
return ""
try:
# Read HEAD to get current commit
head_file = git_dir / "HEAD"
if head_file.exists():
head_content = head_file.read_text().strip()
if head_content.startswith("ref:"):
# HEAD points to a ref
ref_path = git_dir / head_content[5:].strip()
if ref_path.exists():
commit_hash = ref_path.read_text().strip()
return commit_hash[:16] # First 16 chars
else:
# Detached HEAD
return head_content[:16]
    except Exception:
pass
return ""
def _load_per_repo_cache(repo_path: Path) -> Optional[tuple]:
"""Load nodes and metadata from per-repo cache
Returns:
tuple: (nodes_set, metadata_dict) or None if cache invalid
"""
cache_file = repo_path / ".git" / "nodecache.json"
if not cache_file.exists():
return None
try:
with open(cache_file, 'r') as f:
cache_data = json.load(f)
# Verify scanner version
if cache_data.get('scanner_version') != SCANNER_VERSION:
return None
# Verify git hash
current_hash = _get_repo_hash(repo_path)
if cache_data.get('git_hash') != current_hash:
return None
# Return nodes and metadata
nodes = cache_data.get('nodes', [])
metadata = cache_data.get('metadata', {})
return (set(nodes) if nodes else set(), metadata)
    except Exception:
return None
def _save_per_repo_cache(repo_path: Path, all_nodes: Set[str], metadata: dict = None):
"""Save nodes and metadata to per-repo cache"""
cache_file = repo_path / ".git" / "nodecache.json"
if not cache_file.parent.exists():
return
git_hash = _get_repo_hash(repo_path)
cache_data = {
"scanner_version": SCANNER_VERSION,
"git_hash": git_hash,
"scanned_at": datetime.datetime.now().isoformat(),
"nodes": sorted(list(all_nodes)),
"metadata": metadata if metadata else {}
}
try:
with open(cache_file, 'w') as f:
json.dump(cache_data, f, indent=2)
    except Exception:
pass # Silently fail - cache is optional
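# A nodecache.json written by the helper above looks roughly like this (illustrative values):
#   {
#       "scanner_version": "2.0.11",
#       "git_hash": "0123456789abcdef",
#       "scanned_at": "2025-12-12T12:11:52",
#       "nodes": ["NodeA", "NodeB"],
#       "metadata": {"title": "Example Pack"}
#   }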
def download_url(url, dest_folder, filename=None):
# Ensure the destination folder exists
if not os.path.exists(dest_folder):
os.makedirs(dest_folder)
# Extract filename from URL if not provided
if filename is None:
filename = os.path.basename(url)
# Full path to save the file
dest_path = os.path.join(dest_folder, filename)
    # Download the file (stream to avoid loading large files into memory)
    response = requests.get(url, stream=True, timeout=30)
    if response.status_code == 200:
        with open(dest_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)
    else:
        raise RuntimeError(f"Failed to download file from {url} (HTTP {response.status_code})")
def parse_arguments():
"""Parse command-line arguments"""
parser = argparse.ArgumentParser(
description='ComfyUI Manager Node Scanner',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog='''
Examples:
# Standard mode
python3 scanner.py
python3 scanner.py --skip-update
python3 scanner.py --skip-all --force-rescan
# Scan-only mode
python3 scanner.py --scan-only temp-urls-clean.list
python3 scanner.py --scan-only urls.list --temp-dir /custom/temp
python3 scanner.py --scan-only urls.list --skip-update --force-rescan
'''
)
parser.add_argument('--scan-only', type=str, metavar='URL_LIST_FILE',
help='Scan-only mode: provide URL list file (one URL per line)')
parser.add_argument('--temp-dir', type=str, metavar='DIR',
help='Temporary directory for cloned repositories')
parser.add_argument('--skip-update', action='store_true',
help='Skip git clone/pull operations')
parser.add_argument('--skip-stat-update', action='store_true',
help='Skip GitHub stats collection')
parser.add_argument('--skip-all', action='store_true',
help='Skip all update operations')
parser.add_argument('--force-rescan', action='store_true',
help='Force rescan all nodes (ignore cache)')
# Backward compatibility: positional argument for temp_dir
parser.add_argument('temp_dir_positional', nargs='?', metavar='TEMP_DIR',
help='(Legacy) Temporary directory path')
args = parser.parse_args()
return args
# Module-level variables (will be set in main if running as script)
args = None
scan_only_mode = False
url_list_file = None
temp_dir = None
skip_update = False
skip_stat_update = True
g = None
parse_cnt = 0
def extract_nodes(code_text):
global parse_cnt
# Check cache first
cache_key = hash(code_text)
if cache_key in _extract_nodes_cache:
return _extract_nodes_cache[cache_key].copy()
try:
if parse_cnt % 100 == 0:
print(".", end="", flush=True)
parse_cnt += 1
code_text = re.sub(r'\\[^"\']', '', code_text)
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=SyntaxWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
parsed_code = ast.parse(code_text)
# Support both ast.Assign and ast.AnnAssign (for type-annotated assignments)
assignments = (node for node in parsed_code.body if isinstance(node, (ast.Assign, ast.AnnAssign)))
for assignment in assignments:
# Handle ast.AnnAssign (e.g., NODE_CLASS_MAPPINGS: Type = {...})
if isinstance(assignment, ast.AnnAssign):
if isinstance(assignment.target, ast.Name) and assignment.target.id in ['NODE_CONFIG', 'NODE_CLASS_MAPPINGS']:
node_class_mappings = assignment.value
break
# Handle ast.Assign (e.g., NODE_CLASS_MAPPINGS = {...})
            elif isinstance(assignment, ast.Assign) and isinstance(assignment.targets[0], ast.Name) and assignment.targets[0].id in ['NODE_CONFIG', 'NODE_CLASS_MAPPINGS']:
node_class_mappings = assignment.value
break
else:
node_class_mappings = None
        if isinstance(node_class_mappings, ast.Dict):
            s = set()
            for key in node_class_mappings.keys:
                if isinstance(key, ast.Constant) and isinstance(key.value, str):
s.add(key.value.strip())
# Cache the result
_extract_nodes_cache[cache_key] = s
return s
else:
# Cache empty result
_extract_nodes_cache[cache_key] = set()
return set()
    except Exception:
# Cache empty result on error
_extract_nodes_cache[cache_key] = set()
return set()
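# Sketch of the dict-literal registrations extract_nodes() resolves (hypothetical nodes):
#   NODE_CLASS_MAPPINGS = {"MyNode": MyNode}               # ast.Assign form
#   NODE_CLASS_MAPPINGS: dict = {"OtherNode": OtherNode}   # ast.AnnAssign form
# Either form yields {"MyNode"} / {"OtherNode"}; non-literal keys (e.g. Class.NAME)
# fall through to the fallback resolvers below.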
def extract_nodes_from_repo(repo_path: Path, verbose: bool = False, force_rescan: bool = False) -> tuple:
"""
Extract all nodes and metadata from a repository with per-repo caching.
Automatically caches results in .git/nodecache.json.
Cache is invalidated when:
- Git commit hash changes
- Scanner version changes
- force_rescan flag is True
Args:
repo_path: Path to repository root
verbose: If True, print UI-only extension detection messages
force_rescan: If True, ignore cache and force fresh scan
Returns:
tuple: (nodes_set, metadata_dict)
"""
# Ensure path is absolute
repo_path = repo_path.resolve()
# Check per-repo cache first (unless force_rescan is True)
if not force_rescan:
cached_result = _load_per_repo_cache(repo_path)
if cached_result is not None:
return cached_result
# Cache miss - scan all .py files
all_nodes = set()
all_metadata = {}
py_files = list(repo_path.rglob("*.py"))
# Filter out __pycache__, .git, and other hidden directories
filtered_files = []
for f in py_files:
try:
rel_path = f.relative_to(repo_path)
# Skip __pycache__, .git, and any directory starting with .
if '__pycache__' not in str(rel_path) and not any(part.startswith('.') for part in rel_path.parts):
filtered_files.append(f)
        except Exception:
continue
py_files = filtered_files
for py_file in py_files:
try:
# Read file with proper encoding
with open(py_file, 'r', encoding='utf-8', errors='ignore') as f:
code = f.read()
if code:
# Extract nodes using SAME logic as scan_in_file
# V1 nodes (enhanced with fallback patterns)
nodes = extract_nodes_enhanced(code, py_file, visited=set(), verbose=verbose)
all_nodes.update(nodes)
# V3 nodes detection
v3_nodes = extract_v3_nodes(code)
all_nodes.update(v3_nodes)
# Dict parsing - exclude commented NODE_CLASS_MAPPINGS lines
pattern = r"_CLASS_MAPPINGS\s*(?::\s*\w+\s*)?=\s*(?:\\\s*)?{([^}]*)}"
regex = re.compile(pattern, re.MULTILINE | re.DOTALL)
for match_obj in regex.finditer(code):
# Get the line where NODE_CLASS_MAPPINGS is defined
match_start = match_obj.start()
line_start = code.rfind('\n', 0, match_start) + 1
line_end = code.find('\n', match_start)
if line_end == -1:
line_end = len(code)
line = code[line_start:line_end]
# Skip if line starts with # (commented)
if re.match(r'^\s*#', line):
continue
match = match_obj.group(1)
# Filter out commented lines from dict content
match_lines = match.split('\n')
match_filtered = '\n'.join(
line for line in match_lines
if not re.match(r'^\s*#', line)
)
# Extract key-value pairs with double quotes
key_value_pairs = re.findall(r"\"([^\"]*)\"\s*:\s*([^,\n]*)", match_filtered)
for key, value in key_value_pairs:
all_nodes.add(key.strip())
# Extract key-value pairs with single quotes
key_value_pairs = re.findall(r"'([^']*)'\s*:\s*([^,\n]*)", match_filtered)
for key, value in key_value_pairs:
all_nodes.add(key.strip())
# Handle .update() pattern (AFTER comment removal)
code_cleaned = re.sub(r'^#.*?$', '', code, flags=re.MULTILINE)
update_pattern = r"_CLASS_MAPPINGS\.update\s*\(\s*{([^}]*)}\s*\)"
update_match = re.search(update_pattern, code_cleaned, re.DOTALL)
if update_match:
update_dict_text = update_match.group(1)
# Extract key-value pairs (double quotes)
update_pairs = re.findall(r'"([^"]*)"\s*:\s*([^,\n]*)', update_dict_text)
for key, value in update_pairs:
all_nodes.add(key.strip())
# Extract key-value pairs (single quotes)
update_pairs_single = re.findall(r"'([^']*)'\s*:\s*([^,\n]*)", update_dict_text)
for key, value in update_pairs_single:
all_nodes.add(key.strip())
# Additional regex patterns (AFTER comment removal)
patterns = [
r'^[^=]*_CLASS_MAPPINGS\["(.*?)"\]',
r'^[^=]*_CLASS_MAPPINGS\[\'(.*?)\'\]',
r'@register_node\("(.+)",\s*\".+"\)',
r'"(\w+)"\s*:\s*{"class":\s*\w+\s*'
]
for pattern in patterns:
keys = re.findall(pattern, code_cleaned)
all_nodes.update(key.strip() for key in keys)
# Extract metadata from this file
metadata = extract_metadata_only(str(py_file))
all_metadata.update(metadata)
except Exception:
# Silently skip files that can't be read
continue
# Save to per-repo cache
_save_per_repo_cache(repo_path, all_nodes, all_metadata)
return (all_nodes, all_metadata)
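# Typical call, assuming a cloned repo under the scan temp dir (illustrative path):
#   nodes, meta = extract_nodes_from_repo(Path(temp_dir) / "author_repo")
# A second call with an unchanged commit hash is served from .git/nodecache.json.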
def _verify_class_exists(node_name: str, code_text: str, file_path: Optional[Path] = None) -> tuple[bool, Optional[str], Optional[int]]:
"""
Verify that a node class exists and has ComfyUI node structure.
Returns: (exists: bool, file_path: str, line_number: int)
A valid ComfyUI node must have:
- Class definition (not commented)
- At least one of: INPUT_TYPES, RETURN_TYPES, FUNCTION method/attribute
"""
try:
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=SyntaxWarning)
tree = ast.parse(code_text)
    except Exception:
return (False, None, None)
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef):
if node.name == node_name or node.name.replace('_', '') == node_name.replace('_', ''):
# Found class definition - check if it has ComfyUI interface
has_input_types = False
has_return_types = False
has_function = False
for item in node.body:
# Check for INPUT_TYPES method
if isinstance(item, ast.FunctionDef) and item.name == 'INPUT_TYPES':
has_input_types = True
# Check for RETURN_TYPES attribute
elif isinstance(item, ast.Assign):
for target in item.targets:
if isinstance(target, ast.Name):
if target.id == 'RETURN_TYPES':
has_return_types = True
elif target.id == 'FUNCTION':
has_function = True
                    # Any other method also counts as a FUNCTION implementation (loose check)
                    elif isinstance(item, ast.FunctionDef):
                        has_function = True
# Valid if has any ComfyUI signature
if has_input_types or has_return_types or has_function:
file_str = str(file_path) if file_path else None
return (True, file_str, node.lineno)
return (False, None, None)
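# A minimal class that _verify_class_exists() accepts (hypothetical node):
#   class MyNode:
#       @classmethod
#       def INPUT_TYPES(cls): ...
#       RETURN_TYPES = ("STRING",)
#       FUNCTION = "run"
# Any one of the three markers is enough; a bare class with no methods or
# attributes is rejected.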
def _extract_display_name_mappings(code_text: str) -> Set[str]:
"""
Extract node names from NODE_DISPLAY_NAME_MAPPINGS.
Pattern:
NODE_DISPLAY_NAME_MAPPINGS = {
"node_key": "Display Name",
...
}
Returns:
Set of node keys from NODE_DISPLAY_NAME_MAPPINGS
"""
try:
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=SyntaxWarning)
tree = ast.parse(code_text)
    except Exception:
return set()
nodes = set()
for node in tree.body:
if isinstance(node, ast.Assign):
for target in node.targets:
if isinstance(target, ast.Name) and target.id == 'NODE_DISPLAY_NAME_MAPPINGS':
if isinstance(node.value, ast.Dict):
for key in node.value.keys:
if isinstance(key, ast.Constant) and isinstance(key.value, str):
nodes.add(key.value.strip())
return nodes
def extract_nodes_enhanced(
code_text: str,
file_path: Optional[Path] = None,
visited: Optional[Set[Path]] = None,
verbose: bool = False
) -> Set[str]:
"""
Enhanced node extraction with multi-layer detection system.
Scanner 2.0.11 - Comprehensive detection strategy:
- Phase 1: NODE_CLASS_MAPPINGS dict literal
- Phase 2: Class.NAME attribute access (e.g., FreeChat.NAME)
- Phase 3: Item assignment (NODE_CLASS_MAPPINGS["key"] = value)
- Phase 4: Class existence verification (detects active classes even if registration commented)
- Phase 5: NODE_DISPLAY_NAME_MAPPINGS cross-reference
- Phase 6: Empty dict detection (UI-only extensions, logging only)
Fixed Bugs:
- Scanner 2.0.9: Fallback cascade prevented Phase 3 execution
- Scanner 2.0.10: Missed active classes with commented registrations (15 false negatives)
Args:
code_text: Python source code
file_path: Path to file (for logging and caching)
visited: Visited paths (for circular import prevention)
verbose: If True, print UI-only extension detection messages
Returns:
Set of node names (union of all detected patterns)
"""
# Check file-based cache if file_path provided
if file_path is not None:
try:
file_path_obj = Path(file_path) if not isinstance(file_path, Path) else file_path
if file_path_obj.exists():
current_mtime = file_path_obj.stat().st_mtime
# Check if we have cached result with matching mtime and scanner version
if file_path_obj in _file_mtime_cache:
cached_mtime = _file_mtime_cache[file_path_obj]
cache_key = (str(file_path_obj), cached_mtime, SCANNER_VERSION)
if current_mtime == cached_mtime and cache_key in _extract_nodes_enhanced_cache:
return _extract_nodes_enhanced_cache[cache_key].copy()
        except Exception:
pass # Ignore cache errors, proceed with normal execution
# Suppress warnings from AST parsing
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=SyntaxWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
# Phase 1: Original extract_nodes() - dict literal
phase1_nodes = extract_nodes(code_text)
# Phase 2: Class.NAME pattern
if visited is None:
visited = set()
phase2_nodes = _fallback_classname_resolver(code_text, file_path)
# Phase 3: Item assignment pattern
phase3_nodes = _fallback_item_assignment(code_text)
# Phase 4: NODE_DISPLAY_NAME_MAPPINGS cross-reference (NEW in 2.0.11)
# This catches nodes that are in display names but not in NODE_CLASS_MAPPINGS
phase4_nodes = _extract_display_name_mappings(code_text)
# Phase 5: Class existence verification ONLY for display name candidates (NEW in 2.0.11)
# This phase is CONSERVATIVE - only verify classes that appear in display names
# This catches the specific Scanner 2.0.10 bug pattern:
# - NODE_CLASS_MAPPINGS registration is commented
# - NODE_DISPLAY_NAME_MAPPINGS still has the entry
# - Class implementation exists
# Example: Bjornulf_ollamaLoader in Bjornulf_custom_nodes
phase5_nodes = set()
for node_name in phase4_nodes:
# Only check classes that appear in display names but not in registrations
if node_name not in (phase1_nodes | phase2_nodes | phase3_nodes):
exists, _, _ = _verify_class_exists(node_name, code_text, file_path)
if exists:
phase5_nodes.add(node_name)
# Union all results (FIX: Scanner 2.0.9 bug + Scanner 2.0.10 bug)
# 2.0.9: Used early return which missed Phase 3 nodes
# 2.0.10: Only checked registrations, missed classes referenced in display names
all_nodes = phase1_nodes | phase2_nodes | phase3_nodes | phase4_nodes | phase5_nodes
# Phase 6: Empty dict detector (logging only, doesn't add nodes)
if not all_nodes:
_fallback_empty_dict_detector(code_text, file_path, verbose)
# Cache the result
if file_path is not None:
try:
file_path_obj = Path(file_path) if not isinstance(file_path, Path) else file_path
if file_path_obj.exists():
current_mtime = file_path_obj.stat().st_mtime
cache_key = (str(file_path_obj), current_mtime, SCANNER_VERSION)
_extract_nodes_enhanced_cache[cache_key] = all_nodes
_file_mtime_cache[file_path_obj] = current_mtime
        except Exception:
pass
return all_nodes
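# The Scanner 2.0.10 regression that Phases 4+5 close, sketched (hypothetical file):
#   # NODE_CLASS_MAPPINGS = {"MyNode": MyNode}           <- registration commented out
#   NODE_DISPLAY_NAME_MAPPINGS = {"MyNode": "My Node"}   <- Phase 4 still sees the key
#   class MyNode:
#       FUNCTION = "run"                                 <- Phase 5 verifies the class
# "MyNode" therefore lands in the final union even though Phase 1 finds nothing.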
def _fallback_classname_resolver(code_text: str, file_path: Optional[Path]) -> Set[str]:
"""
Detect Class.NAME pattern in NODE_CLASS_MAPPINGS.
Pattern:
NODE_CLASS_MAPPINGS = {
FreeChat.NAME: FreeChat,
PaidChat.NAME: PaidChat
}
"""
try:
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=SyntaxWarning)
parsed = ast.parse(code_text)
    except Exception:
return set()
nodes = set()
for node in parsed.body:
if isinstance(node, ast.Assign):
for target in node.targets:
if isinstance(target, ast.Name) and target.id == 'NODE_CLASS_MAPPINGS':
if isinstance(node.value, ast.Dict):
for key in node.value.keys:
# Detect Class.NAME pattern
if isinstance(key, ast.Attribute):
if isinstance(key.value, ast.Name):
# Use class name as node name
nodes.add(key.value.id)
# Also handle literal strings
elif isinstance(key, ast.Constant) and isinstance(key.value, str):
nodes.add(key.value.strip())
return nodes
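# Note: for an attribute key like FreeChat.NAME the resolver records the class
# name ("FreeChat") as the node name; the runtime value of .NAME is never
# evaluated, so this is a heuristic rather than an exact resolution.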
def _fallback_item_assignment(code_text: str) -> Set[str]:
"""
Detect item assignment pattern.
Pattern:
NODE_CLASS_MAPPINGS = {}
NODE_CLASS_MAPPINGS["MyNode"] = MyNode
"""
try:
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=SyntaxWarning)
parsed = ast.parse(code_text)
    except Exception:
return set()
nodes = set()
for node in ast.walk(parsed):
if isinstance(node, ast.Assign):
for target in node.targets:
if isinstance(target, ast.Subscript):
if (isinstance(target.value, ast.Name) and
target.value.id in ['NODE_CLASS_MAPPINGS', 'NODE_CONFIG']):
# Extract key
if isinstance(target.slice, ast.Constant):
if isinstance(target.slice.value, str):
nodes.add(target.slice.value)
return nodes
def _extract_repo_name(file_path: Path) -> str:
"""
Extract repository name from file path.
    Path structure: {temp_dir}/temp/{author}_{reponame}/{path/to/file.py}
Returns: {author}_{reponame} or filename if extraction fails
"""
try:
parts = file_path.parts
# Find 'temp' directory in path
if 'temp' in parts:
temp_idx = parts.index('temp')
if temp_idx + 1 < len(parts):
# Next part after 'temp' is the repo directory
return parts[temp_idx + 1]
except (ValueError, IndexError):
pass
# Fallback to filename if extraction fails
return file_path.name if hasattr(file_path, 'name') else str(file_path)
def _fallback_empty_dict_detector(code_text: str, file_path: Optional[Path], verbose: bool = False) -> None:
"""
Detect empty NODE_CLASS_MAPPINGS (UI-only extensions).
Logs for documentation purposes only (when verbose=True).
Args:
code_text: Python source code to analyze
file_path: Path to the file being analyzed
verbose: If True, print detection messages
"""
    # Both "NODE_CLASS_MAPPINGS = {}" and "NODE_CLASS_MAPPINGS={}" collapse to the
    # same token once whitespace is stripped, so a single containment check suffices.
    code_normalized = code_text.replace(' ', '').replace('\n', '')
    if 'NODE_CLASS_MAPPINGS={}' in code_normalized:
        if file_path and verbose:
            repo_name = _extract_repo_name(file_path)
            print(f"Info: UI-only extension (empty NODE_CLASS_MAPPINGS): {repo_name}")
def has_comfy_node_base(class_node):
"""Check if class inherits from io.ComfyNode or ComfyNode"""
for base in class_node.bases:
# Case 1: ComfyNode
if isinstance(base, ast.Name) and base.id == 'ComfyNode':
return True
# Case 2: io.ComfyNode
elif isinstance(base, ast.Attribute):
if base.attr == 'ComfyNode':
return True
return False
def extract_keyword_value(call_node, keyword):
"""
Extract string value of keyword argument
Schema(node_id="MyNode") -> "MyNode"
"""
for kw in call_node.keywords:
if kw.arg == keyword:
# ast.Constant (Python 3.8+)
if isinstance(kw.value, ast.Constant):
if isinstance(kw.value.value, str):
return kw.value.value
# ast.Str (Python 3.7-) - suppress deprecation warning
else:
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=DeprecationWarning)
if hasattr(ast, 'Str') and isinstance(kw.value, ast.Str):
return kw.value.s
return None
def is_schema_call(call_node):
"""Check if ast.Call is io.Schema() or Schema()"""
func = call_node.func
if isinstance(func, ast.Name) and func.id == 'Schema':
return True
elif isinstance(func, ast.Attribute) and func.attr == 'Schema':
return True
return False
def extract_node_id_from_schema(class_node):
"""
Extract node_id from define_schema() method
"""
for item in class_node.body:
if isinstance(item, ast.FunctionDef) and item.name == 'define_schema':
# Walk through function body
for stmt in ast.walk(item):
if isinstance(stmt, ast.Call):
# Check if it's Schema() call
if is_schema_call(stmt):
node_id = extract_keyword_value(stmt, 'node_id')
if node_id:
return node_id
return None
def extract_v3_nodes(code_text):
"""
Extract V3 node IDs using AST parsing
Returns: set of node_id strings
"""
global parse_cnt
try:
if parse_cnt % 100 == 0:
print(".", end="", flush=True)
parse_cnt += 1
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=SyntaxWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
tree = ast.parse(code_text)
except (SyntaxError, UnicodeDecodeError):
return set()
nodes = set()
# Find io.ComfyNode subclasses
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef):
# Check if inherits from ComfyNode
if has_comfy_node_base(node):
node_id = extract_node_id_from_schema(node)
if node_id:
nodes.add(node_id)
return nodes
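# The V3 shape the AST walk above detects (hypothetical node):
#   class MyNode(io.ComfyNode):
#       @classmethod
#       def define_schema(cls):
#           return io.Schema(node_id="MyNode")
# extract_v3_nodes() on such a file yields {"MyNode"}.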
# scan
def extract_metadata_only(filename):
"""Extract only metadata (@author, @title, etc) without node scanning"""
try:
with open(filename, encoding='utf-8', errors='ignore') as file:
code = file.read()
metadata = {}
lines = code.strip().split('\n')
        for line in lines:
            if line.startswith(("@author:", "@title:", "@nickname:", "@description:")):
                key, value = line[1:].strip().split(':', 1)
                metadata[key.strip()] = value.strip()
return metadata
    except Exception:
return {}
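# Header block recognized by extract_metadata_only() (illustrative):
#   @author: someone
#   @title: Example Pack
#   @nickname: example
#   @description: What this pack provides
# Each matching line becomes a metadata[key] = value entry.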
def scan_in_file(filename, is_builtin=False):
global builtin_nodes
with open(filename, encoding='utf-8', errors='ignore') as file:
code = file.read()
# Support type annotations (e.g., NODE_CLASS_MAPPINGS: Type = {...}) and line continuations (\)
pattern = r"_CLASS_MAPPINGS\s*(?::\s*\w+\s*)?=\s*(?:\\\s*)?{([^}]*)}"
regex = re.compile(pattern, re.MULTILINE | re.DOTALL)
nodes = set()
class_dict = {}
# V1 nodes detection (enhanced with fallback patterns)
nodes |= extract_nodes_enhanced(code, file_path=Path(filename), visited=set())
# V3 nodes detection
nodes |= extract_v3_nodes(code)
code = re.sub(r'^#.*?$', '', code, flags=re.MULTILINE)
def extract_keys(pattern, code):
keys = re.findall(pattern, code)
return {key.strip() for key in keys}
def update_nodes(nodes, new_keys):
nodes |= new_keys
patterns = [
r'^[^=]*_CLASS_MAPPINGS\["(.*?)"\]',
r'^[^=]*_CLASS_MAPPINGS\[\'(.*?)\'\]',
r'@register_node\("(.+)",\s*\".+"\)',
r'"(\w+)"\s*:\s*{"class":\s*\w+\s*'
]
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {executor.submit(extract_keys, pattern, code): pattern for pattern in patterns}
for future in concurrent.futures.as_completed(futures):
update_nodes(nodes, future.result())
matches = regex.findall(code)
for match in matches:
dict_text = match
key_value_pairs = re.findall(r"\"([^\"]*)\"\s*:\s*([^,\n]*)", dict_text)
for key, value in key_value_pairs:
class_dict[key.strip()] = value.strip()
key_value_pairs = re.findall(r"'([^']*)'\s*:\s*([^,\n]*)", dict_text)
for key, value in key_value_pairs:
class_dict[key.strip()] = value.strip()
for key, value in class_dict.items():
nodes.add(key.strip())
    # Match .update({...}) registrations; escape the dot, allow whitespace around the
    # braces, and handle both quote styles (mirrors extract_nodes_from_repo)
    update_pattern = r"_CLASS_MAPPINGS\.update\s*\(\s*{([^}]*)}\s*\)"
    for update_match in re.finditer(update_pattern, code):
        update_dict_text = update_match.group(1)
        for quote_pattern in (r'"([^"]*)"\s*:\s*([^,\n]*)', r"'([^']*)'\s*:\s*([^,\n]*)"):
            for key, value in re.findall(quote_pattern, update_dict_text):
                class_dict[key.strip()] = value.strip()
                nodes.add(key.strip())
metadata = {}
lines = code.strip().split('\n')
    for line in lines:
        if line.startswith(("@author:", "@title:", "@nickname:", "@description:")):
            key, value = line[1:].strip().split(':', 1)
            metadata[key.strip()] = value.strip()
    if is_builtin:
        builtin_nodes |= set(nodes)  # sets do not support +=; use in-place union
    else:
        # Drop built-in node names so only this extension's custom nodes remain
        nodes -= builtin_nodes
return nodes, metadata
def get_py_file_paths(dirname):
file_paths = []
for root, dirs, files in os.walk(dirname):
if ".git" in root or "__pycache__" in root:
continue
for file in files:
if file.endswith(".py"):
file_path = os.path.join(root, file)
file_paths.append(file_path)
return file_paths
def get_nodes(target_dir):
py_files = []
directories = []
for item in os.listdir(target_dir):
if ".git" in item or "__pycache__" in item:
continue
path = os.path.abspath(os.path.join(target_dir, item))
if os.path.isfile(path) and item.endswith(".py"):
py_files.append(path)
elif os.path.isdir(path):
directories.append(path)
return py_files, directories
def get_urls_from_list_file(list_file):
"""
Read URLs from list file for scan-only mode
Args:
list_file (str): Path to URL list file (one URL per line)
Returns:
list of tuples: [(url, "", None, None), ...]
Format: (url, title, preemptions, nodename_pattern)
- title: Empty string
- preemptions: None
- nodename_pattern: None
File format:
https://github.com/owner/repo1
https://github.com/owner/repo2
# Comments starting with # are ignored
Raises:
FileNotFoundError: If list_file does not exist
"""
if not os.path.exists(list_file):
raise FileNotFoundError(f"URL list file not found: {list_file}")
urls = []
with open(list_file, 'r', encoding='utf-8') as f:
for line_num, line in enumerate(f, 1):
line = line.strip()
# Skip empty lines and comments
if not line or line.startswith('#'):
continue
# Validate URL format (basic check)
if not (line.startswith('http://') or line.startswith('https://')):
print(f"WARNING: Line {line_num} is not a valid URL: {line}")
continue
# Add URL with empty metadata
# (url, title, preemptions, nodename_pattern)
urls.append((line, "", None, None))
print(f"Loaded {len(urls)} URLs from {list_file}")
return urls
def get_git_urls_from_json(json_file):
with open(json_file, encoding='utf-8') as file:
data = json.load(file)
custom_nodes = data.get('custom_nodes', [])
git_clone_files = []
for node in custom_nodes:
if node.get('install_type') == 'git-clone':
files = node.get('files', [])
if files:
git_clone_files.append((files[0], node.get('title'), node.get('preemptions'), node.get('nodename_pattern')))
git_clone_files.append(("https://github.com/comfyanonymous/ComfyUI", "ComfyUI", None, None))
return git_clone_files
def get_py_urls_from_json(json_file):
with open(json_file, encoding='utf-8') as file:
data = json.load(file)
custom_nodes = data.get('custom_nodes', [])
py_files = []
for node in custom_nodes:
if node.get('install_type') == 'copy':
files = node.get('files', [])
if files:
py_files.append((files[0], node.get('title'), node.get('preemptions'), node.get('nodename_pattern')))
return py_files
def clone_or_pull_git_repository(git_url):
repo_name = git_url.split("/")[-1]
if repo_name.endswith(".git"):
repo_name = repo_name[:-4]
repo_dir = os.path.join(temp_dir, repo_name)
if os.path.exists(repo_dir):
try:
repo = Repo(repo_dir)
origin = repo.remote(name="origin")
origin.pull()
repo.git.submodule('update', '--init', '--recursive')
print(f"Pulling {repo_name}...")
except Exception as e:
print(f"Failed to pull '{repo_name}': {e}")
else:
try:
Repo.clone_from(git_url, repo_dir, recursive=True)
print(f"Cloning {repo_name}...")
except Exception as e:
print(f"Failed to clone '{repo_name}': {e}")
def update_custom_nodes(scan_only_mode=False, url_list_file=None):
"""
Update custom nodes by cloning/pulling repositories
Args:
scan_only_mode (bool): If True, use URL list file instead of custom-node-list.json
url_list_file (str): Path to URL list file (required if scan_only_mode=True)
Returns:
dict: node_info mapping {repo_name: (url, title, preemptions, node_pattern)}
"""
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)
node_info = {}
# Select URL source based on mode
if scan_only_mode:
if not url_list_file:
raise ValueError("url_list_file is required in scan-only mode")
git_url_titles_preemptions = get_urls_from_list_file(url_list_file)
print("\n[Scan-Only Mode]")
print(f" - URL source: {url_list_file}")
print(" - GitHub stats: DISABLED")
print(f" - Git clone/pull: {'ENABLED' if not skip_update else 'DISABLED'}")
print(" - Metadata: EMPTY")
else:
if not os.path.exists('custom-node-list.json'):
raise FileNotFoundError("custom-node-list.json not found")
git_url_titles_preemptions = get_git_urls_from_json('custom-node-list.json')
print("\n[Standard Mode]")
print(" - URL source: custom-node-list.json")
print(f" - GitHub stats: {'ENABLED' if not skip_stat_update else 'DISABLED'}")
print(f" - Git clone/pull: {'ENABLED' if not skip_update else 'DISABLED'}")
print(" - Metadata: FULL")
def process_git_url_title(url, title, preemptions, node_pattern):
name = os.path.basename(url)
if name.endswith(".git"):
name = name[:-4]
node_info[name] = (url, title, preemptions, node_pattern)
if not skip_update:
clone_or_pull_git_repository(url)
def process_git_stats(git_url_titles_preemptions):
GITHUB_STATS_CACHE_FILENAME = 'github-stats-cache.json'
GITHUB_STATS_FILENAME = 'github-stats.json'
github_stats = {}
try:
with open(GITHUB_STATS_CACHE_FILENAME, 'r', encoding='utf-8') as file:
github_stats = json.load(file)
except FileNotFoundError:
pass
def is_rate_limit_exceeded():
return g.rate_limiting[0] <= 20
if is_rate_limit_exceeded():
print(f"GitHub API Rate Limit Exceeded: remained - {(g.rate_limiting_resettime - datetime.datetime.now().timestamp())/60:.2f} min")
else:
def renew_stat(url):
if is_rate_limit_exceeded():
return
if 'github.com' not in url:
return None
print('.', end="")
sys.stdout.flush()
try:
# Parsing the URL
parsed_url = urlparse(url)
domain = parsed_url.netloc
path = parsed_url.path
path_parts = path.strip("/").split("/")
if len(path_parts) >= 2 and domain == "github.com":
owner_repo = "/".join(path_parts[-2:])
repo = g.get_repo(owner_repo)
owner = repo.owner
now = datetime.datetime.now(datetime.timezone.utc)
author_time_diff = now - owner.created_at
last_update = repo.pushed_at.strftime("%Y-%m-%d %H:%M:%S") if repo.pushed_at else 'N/A'
item = {
"stars": repo.stargazers_count,
"last_update": last_update,
"cached_time": now.timestamp(),
"author_account_age_days": author_time_diff.days,
}
return url, item
else:
print(f"\nInvalid URL format for GitHub repository: {url}\n")
except Exception as e:
print(f"\nERROR on {url}\n{e}")
return None
# resolve unresolved urls
with concurrent.futures.ThreadPoolExecutor(11) as executor:
futures = []
for url, title, preemptions, node_pattern in git_url_titles_preemptions:
if url not in github_stats:
futures.append(executor.submit(renew_stat, url))
for future in concurrent.futures.as_completed(futures):
url_item = future.result()
if url_item is not None:
url, item = url_item
github_stats[url] = item
# renew outdated cache
outdated_urls = []
for k, v in github_stats.items():
elapsed = (datetime.datetime.now().timestamp() - v['cached_time'])
if elapsed > 60*60*12: # 12 hours
outdated_urls.append(k)
            with concurrent.futures.ThreadPoolExecutor(11) as executor:
                futures = []  # start fresh; do not re-consume futures from the first pass
                for url in outdated_urls:
                    futures.append(executor.submit(renew_stat, url))
for future in concurrent.futures.as_completed(futures):
url_item = future.result()
if url_item is not None:
url, item = url_item
github_stats[url] = item
            with open(GITHUB_STATS_CACHE_FILENAME, 'w', encoding='utf-8') as file:
json.dump(github_stats, file, ensure_ascii=False, indent=4)
with open(GITHUB_STATS_FILENAME, 'w', encoding='utf-8') as file:
for v in github_stats.values():
if "cached_time" in v:
del v["cached_time"]
github_stats = dict(sorted(github_stats.items()))
json.dump(github_stats, file, ensure_ascii=False, indent=4)
print(f"Successfully written to {GITHUB_STATS_FILENAME}.")
if not skip_stat_update:
process_git_stats(git_url_titles_preemptions)
# Git clone/pull for all repositories
with concurrent.futures.ThreadPoolExecutor(11) as executor:
for url, title, preemptions, node_pattern in git_url_titles_preemptions:
executor.submit(process_git_url_title, url, title, preemptions, node_pattern)
# .py file download (skip in scan-only mode - only process git repos)
if not scan_only_mode:
py_url_titles_and_pattern = get_py_urls_from_json('custom-node-list.json')
def download_and_store_info(url_title_preemptions_and_pattern):
url, title, preemptions, node_pattern = url_title_preemptions_and_pattern
name = os.path.basename(url)
if name.endswith(".py"):
node_info[name] = (url, title, preemptions, node_pattern)
try:
download_url(url, temp_dir)
            except Exception:
print(f"[ERROR] Cannot download '{url}'")
with concurrent.futures.ThreadPoolExecutor(10) as executor:
executor.map(download_and_store_info, py_url_titles_and_pattern)
return node_info
def gen_json(node_info, scan_only_mode=False, force_rescan=False):
"""
Generate extension-node-map.json from scanned node information
Args:
node_info (dict): Repository metadata mapping
scan_only_mode (bool): If True, exclude metadata from output
force_rescan (bool): If True, ignore cache and force rescan all nodes
"""
# scan from .py file
node_files, node_dirs = get_nodes(temp_dir)
comfyui_path = os.path.abspath(os.path.join(temp_dir, "ComfyUI"))
# Only reorder if ComfyUI exists in the list
if comfyui_path in node_dirs:
node_dirs.remove(comfyui_path)
node_dirs = [comfyui_path] + node_dirs
data = {}
for dirname in node_dirs:
py_files = get_py_file_paths(dirname)
metadata = {}
# Use per-repo cache for node AND metadata extraction
try:
nodes, metadata = extract_nodes_from_repo(Path(dirname), verbose=False, force_rescan=force_rescan)
        except Exception:
# Fallback to file-by-file scanning if extract_nodes_from_repo fails
nodes = set()
for py in py_files:
                # dirname is still an absolute path here, so compare its basename
                nodes_in_file, metadata_in_file = scan_in_file(py, os.path.basename(dirname) == "ComfyUI")
nodes.update(nodes_in_file)
metadata.update(metadata_in_file)
dirname = os.path.basename(dirname)
if len(nodes) > 0 or (dirname in node_info and node_info[dirname][3] is not None):
nodes = list(nodes)
nodes.sort()
if dirname in node_info:
git_url, title, preemptions, node_pattern = node_info[dirname]
# Conditionally add metadata based on mode
if not scan_only_mode:
# Standard mode: include all metadata
metadata['title_aux'] = title
if preemptions is not None:
metadata['preemptions'] = preemptions
if node_pattern is not None:
metadata['nodename_pattern'] = node_pattern
# Scan-only mode: metadata remains empty
data[git_url] = (nodes, metadata)
else:
# Scan-only mode: Repository not in node_info (expected behavior)
# Construct URL from dirname (author_repo format)
if '_' in dirname:
parts = dirname.split('_', 1)
git_url = f"https://github.com/{parts[0]}/{parts[1]}"
data[git_url] = (nodes, metadata)
else:
print(f"WARN: {dirname} is removed from custom-node-list.json")
    for file in node_files:
        nodes, metadata = scan_in_file(file)
        file = os.path.basename(file)
        # Key the lookup by this file's basename (the previous loop's `dirname` would be stale here)
        if len(nodes) > 0 or (file in node_info and node_info[file][3] is not None):
            nodes = list(nodes)
            nodes.sort()
            if file in node_info:
url, title, preemptions, node_pattern = node_info[file]
# Conditionally add metadata based on mode
if not scan_only_mode:
metadata['title_aux'] = title
if preemptions is not None:
metadata['preemptions'] = preemptions
if node_pattern is not None:
metadata['nodename_pattern'] = node_pattern
data[url] = (nodes, metadata)
else:
print(f"Missing info: {file}")
# scan from node_list.json file
extensions = [name for name in os.listdir(temp_dir) if os.path.isdir(os.path.join(temp_dir, name))]
for extension in extensions:
node_list_json_path = os.path.join(temp_dir, extension, 'node_list.json')
if os.path.exists(node_list_json_path):
# Skip if extension not in node_info (scan-only mode with limited URLs)
if extension not in node_info:
continue
git_url, title, preemptions, node_pattern = node_info[extension]
with open(node_list_json_path, 'r', encoding='utf-8') as f:
try:
node_list_json = json.load(f)
except Exception as e:
print(f"\nERROR: Invalid json format '{node_list_json_path}'")
print("------------------------------------------------------")
print(e)
print("------------------------------------------------------")
node_list_json = {}
metadata_in_url = {}
if git_url not in data:
nodes = set()
else:
nodes_in_url, metadata_in_url = data[git_url]
nodes = set(nodes_in_url)
try:
for x, desc in node_list_json.items():
nodes.add(x.strip())
except Exception as e:
print(f"\nERROR: Invalid json format '{node_list_json_path}'")
print("------------------------------------------------------")
print(e)
print("------------------------------------------------------")
node_list_json = {}
# Conditionally add metadata based on mode
if not scan_only_mode:
metadata_in_url['title_aux'] = title
if preemptions is not None:
metadata_in_url['preemptions'] = preemptions
if node_pattern is not None:
metadata_in_url['nodename_pattern'] = node_pattern
nodes = list(nodes)
nodes.sort()
data[git_url] = (nodes, metadata_in_url)
json_path = "extension-node-map.json"
with open(json_path, "w", encoding='utf-8') as file:
json.dump(data, file, indent=4, sort_keys=True)
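# An extension-node-map.json entry produced above looks like this (illustrative):
#   "https://github.com/author/repo": [
#       ["NodeA", "NodeB"],
#       {"title_aux": "Repo Title"}
#   ]
# (json.dump serializes each (nodes, metadata) tuple as a two-element array.)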
if __name__ == "__main__":
# Parse arguments
args = parse_arguments()
# Determine mode
scan_only_mode = args.scan_only is not None
url_list_file = args.scan_only if scan_only_mode else None
# Determine temp_dir
if args.temp_dir:
temp_dir = args.temp_dir
elif args.temp_dir_positional:
temp_dir = args.temp_dir_positional
else:
temp_dir = os.path.join(os.getcwd(), ".tmp")
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)
# Determine skip flags
skip_update = args.skip_update or args.skip_all
skip_stat_update = args.skip_stat_update or args.skip_all or scan_only_mode
    if not skip_stat_update:
        github_token = os.environ.get('GITHUB_TOKEN')
        if not github_token:
            raise RuntimeError("GITHUB_TOKEN environment variable is required for GitHub stats collection")
        g = Github(auth=Auth.Token(github_token))
    else:
        g = None
print("### ComfyUI Manager Node Scanner ###")
if scan_only_mode:
print(f"\n# [Scan-Only Mode] Processing URL list: {url_list_file}\n")
else:
print("\n# [Standard Mode] Updating extensions\n")
# Update/clone repositories and collect node info
updated_node_info = update_custom_nodes(scan_only_mode, url_list_file)
print("\n# Generating 'extension-node-map.json'...\n")
# Generate extension-node-map.json
    # argparse always defines force_rescan (store_true), so it can be read directly
    force_rescan = args.force_rescan
if force_rescan:
print("⚠️ Force rescan enabled - ignoring all cached results\n")
gen_json(updated_node_info, scan_only_mode, force_rescan)
print("\n✅ DONE.\n")
if scan_only_mode:
print("Output: extension-node-map.json (node mappings only)")
else:
print("Output: extension-node-map.json (full metadata)")