11# src/flatcode/core/scanner.py
22import sys
3+ import os
34from pathlib import Path
45from typing import Iterator , Set , List , Tuple
56
@@ -14,35 +15,78 @@ def __init__(self, root_dir: Path, ignore_rules: List[Tuple[str, bool]], extensi
1415 self .extensions = extensions
1516 self .match_all = "*" in extensions
1617
18+ def _is_binary_file (self , path : Path ) -> bool :
19+ """
20+ Reads the first 1024 bytes to check for null bytes.
21+ Returns True if likely binary, False if likely text.
22+ """
23+ try :
24+ with path .open ("rb" ) as f :
25+ chunk = f .read (1024 )
26+ return b'\0 ' in chunk
27+ except Exception :
28+ # If we can't read it (permission, etc), treat as unsafe/binary
29+ return True
30+
def scan(self) -> Iterator[FileContext]:
    """Walk the tree under ``self.root_dir`` and yield readable text files.

    Ignored directories are pruned before ``os.walk`` descends into them,
    so their contents are never visited. Each remaining file is then
    filtered by, in order: the ignore rules, the extension filter
    (skipped when ``self.match_all`` is true), and the binary probe
    ``self._is_binary_file``.

    Directory and file names are visited in sorted order so that the
    sequence of yielded items is reproducible across runs.

    Yields:
        A FileContext for every accepted file, built from its absolute
        path, its POSIX-style path relative to ``self.root_dir``, its
        UTF-8 decoded content and the token count of that content.
    """
    for root, dirs, files in os.walk(self.root_dir):
        root_path = Path(root)

        # --- 1. Prune directories ---
        # os.walk consults the very same ``dirs`` list object to decide
        # where to descend next, so it must be updated in place (slice
        # assignment) rather than rebound to a new list.
        kept_dirs = []
        for dir_name in dirs:
            try:
                dir_rel_path = (root_path / dir_name).relative_to(self.root_dir)
            except ValueError:
                # Should not happen in a standard walk; keep the directory.
                kept_dirs.append(dir_name)
                continue

            # is_directory=True lets the matcher tell "venv/" from "venv".
            if not is_path_ignored(dir_rel_path, self.ignore_rules, is_directory=True):
                kept_dirs.append(dir_name)
        dirs[:] = sorted(kept_dirs)

        # --- 2. Process files ---
        for file_name in sorted(files):
            file_abs_path = root_path / file_name
            try:
                rel_path = file_abs_path.relative_to(self.root_dir)
            except ValueError:
                continue

            # A. Ignore check
            if is_path_ignored(rel_path, self.ignore_rules, is_directory=False):
                continue

            # B. Extension check: an entry may match either the suffix
            #    or the complete file name.
            if not self.match_all:
                if not (
                    file_abs_path.suffix in self.extensions
                    or file_abs_path.name in self.extensions
                ):
                    continue

            # C. Binary check: silently skip files that look binary
            #    (this also covers files that cannot be opened).
            if self._is_binary_file(file_abs_path):
                continue

            # D. Read and tokenize
            try:
                content = file_abs_path.read_text(encoding="utf-8")
                tokens = Tokenizer.count(content)
                yield FileContext(
                    path=file_abs_path,
                    rel_path=rel_path.as_posix(),
                    content=content,
                    token_count=tokens,
                )
            except UnicodeDecodeError:
                # Second line of defence: the binary probe only inspects
                # the start of the file, so undecodable data can still
                # show up here.
                continue
            except Exception as e:
                print(f" > [Warning] Skipping {rel_path.as_posix()} (read error: {e})", file=sys.stderr)
0 commit comments