feat(scanner): collect and report git errors by category

Errors during git clone/pull are now accumulated in a thread-safe
collector and printed as a grouped summary after all operations
complete, instead of being scattered in per-repo log lines.

Error categories: repository_not_found, divergent_branch, auth_failed,
network_error, merge_conflict, permission_denied, other.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Dr.Lt.Data 2026-02-20 09:27:13 +09:00
parent f41365abe9
commit 3c24614aeb

View File

@ -2,6 +2,8 @@ import ast
import re import re
import os import os
import json import json
import threading
from collections import defaultdict
from git import Repo from git import Repo
import concurrent import concurrent
import datetime import datetime
@ -195,6 +197,96 @@ g = None
parse_cnt = 0 parse_cnt = 0
# Thread-safe git error collector
_GIT_ERROR_COLLECTOR = {
'lock': threading.Lock(),
'by_category': defaultdict(list), # category -> list[{'repo': str, 'op': str, 'msg': str}]
}
# Ordered regex patterns for error categorization (first match wins)
_GIT_ERROR_PATTERNS = [
('repository_not_found', re.compile(
r'repository\s+not\s+found|does\s+not\s+exist|\b404\b|remote:\s*repository\s+not\s+found',
re.IGNORECASE
)),
('divergent_branch', re.compile(
r'divergent\s+branches|need\s+to\s+specify\s+how\s+to\s+reconcile\s+divergent\s+branches',
re.IGNORECASE
)),
('auth_failed', re.compile(
r'authentication\s+failed|could\s+not\s+read\s+username|invalid\s+username|invalid\s+password|auth\s+failed',
re.IGNORECASE
)),
('network_error', re.compile(
r'could\s+not\s+resolve\s+host|connection\s+refused|timed?\s*out|failed\s+to\s+connect|'
r'network\s+is\s+unreachable|temporary\s+failure\s+in\s+name\s+resolution',
re.IGNORECASE
)),
('merge_conflict', re.compile(
r'merge\s+conflict|\bCONFLICT\b|automatic\s+merge\s+failed',
re.IGNORECASE
)),
('permission_denied', re.compile(
r'permission\s+denied|access\s+denied|operation\s+not\s+permitted|publickey',
re.IGNORECASE
)),
]
_CATEGORY_LABELS = {
'repository_not_found': 'Repository Not Found',
'divergent_branch': 'Divergent Branch',
'auth_failed': 'Authentication Failed',
'network_error': 'Network Error',
'merge_conflict': 'Merge Conflict',
'permission_denied': 'Permission Denied',
'other': 'Other',
}
def _categorize_git_error(error_str: str) -> str:
"""Classify a git error string into a category. First match wins."""
for category, pattern in _GIT_ERROR_PATTERNS:
if pattern.search(error_str):
return category
return 'other'
def _record_git_error(repo_name: str, op: str, error: Exception) -> None:
"""Record a git error in the thread-safe collector."""
msg = str(error)
# Truncate very long messages
if len(msg) > 200:
msg = msg[:197] + '...'
category = _categorize_git_error(msg)
with _GIT_ERROR_COLLECTOR['lock']:
_GIT_ERROR_COLLECTOR['by_category'][category].append({
'repo': repo_name,
'op': op,
'msg': msg,
})
def _report_git_errors(errors: dict) -> None:
"""Print a grouped summary of git errors by category."""
by_category = errors.get('by_category', {})
if not by_category:
return
total = sum(len(v) for v in by_category.values())
print(f"\n{'='*60}")
print(f"Git Operation Errors Summary: {total} failure(s)")
print(f"{'='*60}")
for category, label in _CATEGORY_LABELS.items():
entries = by_category.get(category, [])
if not entries:
continue
print(f"\n[{label}] ({len(entries)} repo(s))")
for entry in entries:
print(f"{entry['repo']} ({entry['op']}): {entry['msg']}")
print(f"{'='*60}\n")
def extract_nodes(code_text): def extract_nodes(code_text):
global parse_cnt global parse_cnt
@ -1172,12 +1264,14 @@ def clone_or_pull_git_repository(git_url):
print(f"Pulling {repo_name}...") print(f"Pulling {repo_name}...")
except Exception as e: except Exception as e:
print(f"Failed to pull '{repo_name}': {e}") print(f"Failed to pull '{repo_name}': {e}")
_record_git_error(repo_name, 'pull', e)
else: else:
try: try:
Repo.clone_from(git_url, repo_dir, recursive=True) Repo.clone_from(git_url, repo_dir, recursive=True)
print(f"Cloning {repo_name}...") print(f"Cloning {repo_name}...")
except Exception as e: except Exception as e:
print(f"Failed to clone '{repo_name}': {e}") print(f"Failed to clone '{repo_name}': {e}")
_record_git_error(repo_name, 'clone', e)
def update_custom_nodes(scan_only_mode=False, url_list_file=None): def update_custom_nodes(scan_only_mode=False, url_list_file=None):
@ -1328,11 +1422,18 @@ def update_custom_nodes(scan_only_mode=False, url_list_file=None):
if not skip_stat_update: if not skip_stat_update:
process_git_stats(git_url_titles_preemptions) process_git_stats(git_url_titles_preemptions)
# Reset error collector before this run
with _GIT_ERROR_COLLECTOR['lock']:
_GIT_ERROR_COLLECTOR['by_category'].clear()
# Git clone/pull for all repositories # Git clone/pull for all repositories
with concurrent.futures.ThreadPoolExecutor(11) as executor: with concurrent.futures.ThreadPoolExecutor(11) as executor:
for url, title, preemptions, node_pattern in git_url_titles_preemptions: for url, title, preemptions, node_pattern in git_url_titles_preemptions:
executor.submit(process_git_url_title, url, title, preemptions, node_pattern) executor.submit(process_git_url_title, url, title, preemptions, node_pattern)
# Report any git errors grouped by category (after all workers complete)
_report_git_errors(_GIT_ERROR_COLLECTOR)
# .py file download (skip in scan-only mode - only process git repos) # .py file download (skip in scan-only mode - only process git repos)
if not scan_only_mode: if not scan_only_mode:
py_url_titles_and_pattern = get_py_urls_from_json('custom-node-list.json') py_url_titles_and_pattern = get_py_urls_from_json('custom-node-list.json')