2525from memory_profiler import memory_usage
2626from numpy .typing import NDArray
2727import logging
28-
28+ from itertools import islice
29+ import pickle
2930from . import __version__ , __description__
3031
3132from .feature_extraction import (
3738from .nearest_neighbors import NNDescent_ava
3839from . import global_variables
3940from .custom_logging import logger , add_log_file
41+ from .align import (
42+ Seq ,
43+ get_overlap_candidates ,
44+ run_multiprocess_alignment_optimized ,
45+ cWeightedSemiglobalAligner ,
46+ AlignmentResult
47+ )
4048
4149
4250logger .setLevel (logging .DEBUG )
@@ -108,6 +116,7 @@ def parse_command_line_arguments():
108116 help = "Minimum allowed frequency of a k-mer in all reads." ,
109117 )
110118 parser .add_argument (
119+ "-t" ,
111120 "--threads" ,
112121 type = int ,
113122 required = False ,
@@ -118,6 +127,7 @@ def parse_command_line_arguments():
118127 type = int ,
119128 required = False ,
120129 default = 1000 ,
130+ help = "Number of reads to process in each chunk when generating the feature matrix." ,
121131 )
122132
123133 parser .add_argument (
@@ -150,7 +160,7 @@ def parse_command_line_arguments():
150160 "--save-feature-matrix" ,
151161 action = "store_true" ,
152162 default = False ,
153- help = "Save the feature matrix to a file." ,
163+ help = "Save the embedding feature matrix to a file." ,
154164 )
155165 parser .add_argument (
156166 "--keep-intermediates" ,
@@ -182,6 +192,7 @@ def get_neighbors_ava(
182192 logger .info (
183193 f"Using { global_variables .threads } threads"
184194 )
195+
185196 neighbor_indices , distances = NNDescent_ava ().get_neighbors (
186197 embedding_matrix ,
187198 metric = "cosine" ,
@@ -259,43 +270,6 @@ def get_metadata_table(
259270 metadata_df = pd .DataFrame (metadata )
260271 return metadata_df
261272
def get_output_dataframe(
    neighbor_matrix: NDArray,
    read_names: List[str],
    strands: List[int],
) -> pd.DataFrame:
    """Build the pairwise overlap table from the nearest-neighbor matrix.

    Each row of ``neighbor_matrix`` lists, for one query read, the indices of
    its nearest neighbors in similarity order.  One output row is emitted per
    (query, neighbor) pair.  A read's own index is skipped, but it still
    consumes its rank, so ``neighbor_rank`` is the position within the
    original neighbor row — TODO confirm this rank convention is what
    downstream consumers expect.

    Args:
        neighbor_matrix: ``(n_reads, k)`` array of neighbor read indices.
        read_names: Read name for each read index.
        strands: Strand flag per read; 0 maps to "+", 1 maps to "-".

    Returns:
        DataFrame with columns ``query_name``, ``query_orientation``,
        ``target_name``, ``target_orientation``, ``neighbor_rank``.
    """
    orientation = ("+", "-")
    rows = []
    for query_index in range(neighbor_matrix.shape[0]):
        query_name = read_names[query_index]
        query_orientation = orientation[strands[query_index]]
        for rank, target_index in enumerate(neighbor_matrix[query_index]):
            if target_index == query_index:
                # Never report a read as its own overlap partner.
                continue
            rows.append(
                (
                    query_name,
                    query_orientation,
                    read_names[target_index],
                    orientation[strands[target_index]],
                    rank,
                )
            )

    df = pd.DataFrame(
        rows,
        columns=[
            "query_name",
            "query_orientation",
            "target_name",
            "target_orientation",
            "neighbor_rank",
        ],
    )
    logger.debug(f"Output DataFrame shape: {df.shape}")
    return df
299273def run_fedrann_pipeline (
300274 * ,
301275 input_path : str ,
@@ -321,7 +295,6 @@ def run_fedrann_pipeline(
321295 sample_fraction = kmer_sample_fraction ,
322296 min_multiplicity = kmer_min_multiplicity
323297 )
324- logger .debug (f"kmer_searcher n_features: { n_features } " )
325298
326299 logger .info ("--- 2. Generate dimension reduction and IDF matrix ---" )
327300 fwd_kmer_library_path = join (global_variables .temp_dir , "fwd_kmer_library.fasta" )
@@ -330,56 +303,60 @@ def run_fedrann_pipeline(
330303 counter_file = fwd_kmer_library_path ,
331304 n_features = n_features
332305 )
333- logger .debug (f"get_precompute_matrix n_features: { n_features } " )
334-
306+
335307 logger .info ("--- 3. Generate feature matrix ---" )
336308 embedding_matrix = get_feature_matrix (
337309 ks_file = kmer_searcher_output_path ,
310+ fasta_file = input_path ,
338311 precompute_matrix = precompute_mat ,
339312 kmer_count = n_features ,
340313 read_count = read_count ,
341314 chunk_size = chunk_size
342- )
343-
344- # # Save metadata
345- # metadata_output_file = join(output_dir, "metadata.tsv")
346- # logger.info(f"Saved metadata table to {metadata_output_file}")
347- # metadata_df = get_metadata_table(
348- # read_names=read_names,
349- # strands=strands,
350- # )
351- # metadata_df.to_csv(metadata_output_file, sep="\t", index=False)
352- # del read_names, strands
353- # gc.collect()
315+ )
316+ encoded_reads = get_metadata (kmer_searcher_output_path ,input_path ,n_features )
354317
355-
318+ if save_feature_matrix :
319+ feature_matrix_file = join (output_dir , "embedding_feature_matrix.npy" )
320+ logger .debug (f"Saving feature matrix to { feature_matrix_file } " )
321+ np .save (feature_matrix_file , embedding_matrix )
322+
356323 # Nearest neighbors search
357324 logger .info ("--- 4. Nearest Neighbors Search ---" )
358325 neighbor_matrix , distances = get_neighbors_ava (
359326 embedding_matrix ,
360327 nndescent_n_trees = nndescent_n_trees ,
361328 nndescent_n_neighbors = nndescent_n_neighbors ,
362329 )
330+
363331 del embedding_matrix
364332 gc .collect ()
365333
366- # Save output
367- nbr_output_file = join ( output_dir , "overlaps.tsv" )
368- logger . debug ( "Saving overlap table to %s" , nbr_output_file )
369-
370- read_names , strands = get_metadata (
371- ks_file = kmer_searcher_output_path ,
372- kmer_count = n_features ,
373- )
334+ logger . info ( "--- 5. Align candidates ---" )
335+
336+ overlap_candidates = get_overlap_candidates ( neighbor_matrix , nndescent_n_neighbors )
337+
338+ ## For testing only: persist the raw overlap candidates for offline inspection
339+ overlap_candidates_file = join ( output_dir , "overlaps_candidates.pkl" )
340+ with open ( overlap_candidates_file , "wb" ) as f :
341+ pickle . dump ( overlap_candidates , f )
374342
375- df = get_output_dataframe (
376- neighbor_matrix = neighbor_matrix ,
377- read_names = read_names ,
378- strands = strands
343+ ####
344+ nbr_output_file = join (output_dir , "overlaps.paf" )
345+ run_multiprocess_alignment_optimized (
346+ overlap_candidates ,
347+ encoded_reads ,
348+ marker_weights = None ,
349+ kmer_size = kmer_size ,
350+ aligner = cWeightedSemiglobalAligner ,
351+ processes = global_variables .threads ,
352+ batch_size = 100 ,
353+ output_path = nbr_output_file ,
354+ max_total_wait_seconds = 600 ,
379355 )
356+ # Save output
357+
358+ logger .debug ("Saving overlap table to %s" , nbr_output_file )
380359
381- df .to_csv (nbr_output_file , sep = "\t " , index = False )
382-
383360 if not keep_intermediates :
384361 logger .debug ("Removing intermediate files" )
385362 rmtree (global_variables .temp_dir )
@@ -425,34 +402,32 @@ def main():
425402 save_feature_matrix = args .save_feature_matrix ,
426403 chunk_size = args .chunk_size
427404 )
405+
428406 if args .mprof :
429- logger .debug ("Attention: Memory profiling enabled. Running with memory profiler." )
430407 mprof_dir = join (output_dir , "mprof" )
431408 os .makedirs (mprof_dir , exist_ok = True )
432409 mprof_output_path = join (mprof_dir , "memory_profile.dat" )
433410
434- # 确保函数有足够的执行时间
435- @memory_usage (
436- backend = "psutil" ,
437- interval = 1 ,
438- multiprocess = True ,
439- include_children = True ,
440- timestamps = True ,
441- max_usage = False ,
442- stream = open (mprof_output_path , "wt" ) # 直接传入文件流
443- )
444- def profiled_function ():
445- return f ()
446-
447- # 执行并确保文件关闭
448- try :
449- profiled_function ()
450- finally :
451- # 确保文件正确关闭
452- if 'profiled_function' in locals ():
453- # 获取stream并关闭
454- pass
411+ with open (mprof_output_path , "wt" ) as f_stream :
412+ logger .debug (f"Profiling to { mprof_output_path } " )
413+
414+ # 1. Run the pipeline and collect the returned list of (memory, timestamp) samples
415+ mem_result = memory_usage (
416+ f ,
417+ backend = "psutil" ,
418+ interval = 1 ,
419+ multiprocess = True ,
420+ include_children = True ,
421+ timestamps = True
422+ )
423+
424+ # 2. Manually write the samples to the file (mimicking the mprof .dat format)
425+ f_stream .write ("MT 1.0\n ") # mprof file-format marker
426+ for mem , ts in mem_result :
427+ f_stream .write (f"MEM { mem :.6f} { ts :.6f} \n " )
428+ f_stream .flush () # force the buffered samples to disk
455429 else :
430+ # Normal execution (no memory profiling)
456431 f ()
457432
458433
0 commit comments