diff --git a/TESTING.md b/TESTING.md new file mode 100644 index 00000000..f831e51b --- /dev/null +++ b/TESTING.md @@ -0,0 +1,24 @@ +# Testing +In each microservice's directory, run: + +``` +./run-tests.sh +``` + +It expects [gcv-docker-compose](https://github.com/legumeinfo/gcv-docker-compose) to be the parent directory. Wherever that repo is on your machine: + +``` +GCV_DOCKER_COMPOSE=/home/mrbean/github/gcv-docker-compose ./run-tests.sh +``` + +File structure overview: + +``` +/ +├── tests/ +│ ├── test_*.py +│ └── conftest.py # Fixtures +├── pytest.ini # Configuration +├── requirements-test.txt # Test dependencies +└── Dockerfile.test # Docker image for running tests +``` \ No newline at end of file diff --git a/chromosome/chromosome/commands.py b/chromosome/chromosome/commands.py index 4995ebd9..868e7e83 100644 --- a/chromosome/chromosome/commands.py +++ b/chromosome/chromosome/commands.py @@ -22,8 +22,8 @@ import os import sys +from importlib.resources import files as resource_files -import pkg_resources import setuptools @@ -61,9 +61,7 @@ def build_package_protos(self): if filename.endswith(".proto"): proto_files.append(os.path.abspath(os.path.join(root, filename))) - well_known_protos_include = pkg_resources.resource_filename( - "grpc_tools", "_proto" - ) + well_known_protos_include = str(resource_files("grpc_tools").joinpath("_proto")) for proto_file in proto_files: command = [ diff --git a/chromosome/requirements.txt b/chromosome/requirements.txt index ee244fac..5d6212f9 100644 --- a/chromosome/requirements.txt +++ b/chromosome/requirements.txt @@ -6,7 +6,7 @@ # aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.12.15 +aiohttp==3.13.0 # via # aiohttp-cors # chromosome (setup.py) @@ -14,35 +14,37 @@ aiohttp-cors==0.8.1 # via chromosome (setup.py) aiosignal==1.4.0 # via aiohttp -attrs==25.3.0 +attrs==25.4.0 # via aiohttp -frozenlist==1.7.0 +frozenlist==1.8.0 # via # aiohttp # aiosignal -grpcio==1.74.0 +grpcio>=1.78.0 # via # chromosome (setup.py) # grpcio-tools -grpcio-tools==1.74.0 +grpcio-tools>=1.78.0 # 
via chromosome (setup.py) -idna==3.10 +idna==3.11 # via yarl -multidict==6.6.4 +multidict==6.7.0 # via # aiohttp # yarl -propcache==0.3.2 +propcache==0.4.1 # via # aiohttp # yarl -protobuf==6.32.0 +protobuf==6.32.1 # via grpcio-tools redis==6.4.0 # via chromosome (setup.py) +typing-extensions==4.15.0 + # via grpcio uvloop==0.21.0 # via chromosome (setup.py) -yarl==1.20.1 +yarl==1.22.0 # via aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/chromosome_region/chromosome_region/commands.py b/chromosome_region/chromosome_region/commands.py index af3401ae..c4488226 100644 --- a/chromosome_region/chromosome_region/commands.py +++ b/chromosome_region/chromosome_region/commands.py @@ -22,8 +22,8 @@ import os import sys +from importlib.resources import files as resource_files -import pkg_resources import setuptools @@ -61,9 +61,7 @@ def build_package_protos(self): if filename.endswith(".proto"): proto_files.append(os.path.abspath(os.path.join(root, filename))) - well_known_protos_include = pkg_resources.resource_filename( - "grpc_tools", "_proto" - ) + well_known_protos_include = str(resource_files("grpc_tools").joinpath("_proto")) for proto_file in proto_files: command = [ diff --git a/chromosome_region/requirements.txt b/chromosome_region/requirements.txt index 83f1a5d4..7f241258 100644 --- a/chromosome_region/requirements.txt +++ b/chromosome_region/requirements.txt @@ -6,7 +6,7 @@ # aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.12.15 +aiohttp==3.13.0 # via # aiohttp-cors # chromosome_region (setup.py) @@ -14,35 +14,37 @@ aiohttp-cors==0.8.1 # via chromosome_region (setup.py) aiosignal==1.4.0 # via aiohttp -attrs==25.3.0 +attrs==25.4.0 # via aiohttp -frozenlist==1.7.0 +frozenlist==1.8.0 # via # aiohttp # aiosignal -grpcio==1.74.0 +grpcio>=1.78.0 # via # chromosome_region (setup.py) # grpcio-tools -grpcio-tools==1.74.0 +grpcio-tools>=1.78.0 # via chromosome_region (setup.py) -idna==3.10 +idna==3.11 # via yarl 
-multidict==6.6.4 +multidict==6.7.0 # via # aiohttp # yarl -propcache==0.3.2 +propcache==0.4.1 # via # aiohttp # yarl -protobuf==6.32.0 +protobuf==6.32.1 # via grpcio-tools redis==6.4.0 # via chromosome_region (setup.py) +typing-extensions==4.15.0 + # via grpcio uvloop==0.21.0 # via chromosome_region (setup.py) -yarl==1.20.1 +yarl==1.22.0 # via aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/chromosome_search/chromosome_search/commands.py b/chromosome_search/chromosome_search/commands.py index 28c285b6..f8c8501f 100644 --- a/chromosome_search/chromosome_search/commands.py +++ b/chromosome_search/chromosome_search/commands.py @@ -22,8 +22,8 @@ import os import sys +from importlib.resources import files as resource_files -import pkg_resources import setuptools @@ -61,9 +61,7 @@ def build_package_protos(self): if filename.endswith(".proto"): proto_files.append(os.path.abspath(os.path.join(root, filename))) - well_known_protos_include = pkg_resources.resource_filename( - "grpc_tools", "_proto" - ) + well_known_protos_include = str(resource_files("grpc_tools").joinpath("_proto")) for proto_file in proto_files: command = [ diff --git a/chromosome_search/requirements.txt b/chromosome_search/requirements.txt index 459677d0..1d01dfc8 100644 --- a/chromosome_search/requirements.txt +++ b/chromosome_search/requirements.txt @@ -6,7 +6,7 @@ # aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.12.15 +aiohttp==3.13.0 # via # aiohttp-cors # chromosome_search (setup.py) @@ -14,35 +14,37 @@ aiohttp-cors==0.8.1 # via chromosome_search (setup.py) aiosignal==1.4.0 # via aiohttp -attrs==25.3.0 +attrs==25.4.0 # via aiohttp -frozenlist==1.7.0 +frozenlist==1.8.0 # via # aiohttp # aiosignal -grpcio==1.74.0 +grpcio>=1.78.0 # via # chromosome_search (setup.py) # grpcio-tools -grpcio-tools==1.74.0 +grpcio-tools>=1.78.0 # via chromosome_search (setup.py) -idna==3.10 +idna==3.11 # via yarl -multidict==6.6.4 +multidict==6.7.0 # via # aiohttp # yarl 
-propcache==0.3.2 +propcache==0.4.1 # via # aiohttp # yarl -protobuf==6.32.0 +protobuf==6.32.1 # via grpcio-tools redis==6.4.0 # via chromosome_search (setup.py) +typing-extensions==4.15.0 + # via grpcio uvloop==0.21.0 # via chromosome_search (setup.py) -yarl==1.20.1 +yarl==1.22.0 # via aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/gene_search/gene_search/commands.py b/gene_search/gene_search/commands.py index b4b12161..32bb40b1 100644 --- a/gene_search/gene_search/commands.py +++ b/gene_search/gene_search/commands.py @@ -22,8 +22,8 @@ import os import sys +from importlib.resources import files as resource_files -import pkg_resources import setuptools @@ -61,9 +61,7 @@ def build_package_protos(self): if filename.endswith(".proto"): proto_files.append(os.path.abspath(os.path.join(root, filename))) - well_known_protos_include = pkg_resources.resource_filename( - "grpc_tools", "_proto" - ) + well_known_protos_include = str(resource_files("grpc_tools").joinpath("_proto")) for proto_file in proto_files: command = [ diff --git a/gene_search/requirements.txt b/gene_search/requirements.txt index 16c5e924..8f0381a9 100644 --- a/gene_search/requirements.txt +++ b/gene_search/requirements.txt @@ -6,7 +6,7 @@ # aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.12.15 +aiohttp==3.13.0 # via # aiohttp-cors # gene_search (setup.py) @@ -14,35 +14,37 @@ aiohttp-cors==0.8.1 # via gene_search (setup.py) aiosignal==1.4.0 # via aiohttp -attrs==25.3.0 +attrs==25.4.0 # via aiohttp -frozenlist==1.7.0 +frozenlist==1.8.0 # via # aiohttp # aiosignal -grpcio==1.74.0 +grpcio>=1.78.0 # via # gene_search (setup.py) # grpcio-tools -grpcio-tools==1.74.0 +grpcio-tools>=1.78.0 # via gene_search (setup.py) -idna==3.10 +idna==3.11 # via yarl -multidict==6.6.4 +multidict==6.7.0 # via # aiohttp # yarl -propcache==0.3.2 +propcache==0.4.1 # via # aiohttp # yarl -protobuf==6.32.0 +protobuf==6.32.1 # via grpcio-tools redis==6.4.0 # via gene_search 
(setup.py) +typing-extensions==4.15.0 + # via grpcio uvloop==0.21.0 # via gene_search (setup.py) -yarl==1.20.1 +yarl==1.22.0 # via aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/genes/genes/commands.py b/genes/genes/commands.py index ab6a041f..c3e900dc 100644 --- a/genes/genes/commands.py +++ b/genes/genes/commands.py @@ -22,8 +22,8 @@ import os import sys +from importlib.resources import files as resource_files -import pkg_resources import setuptools @@ -61,9 +61,7 @@ def build_package_protos(self): if filename.endswith(".proto"): proto_files.append(os.path.abspath(os.path.join(root, filename))) - well_known_protos_include = pkg_resources.resource_filename( - "grpc_tools", "_proto" - ) + well_known_protos_include = str(resource_files("grpc_tools").joinpath("_proto")) for proto_file in proto_files: command = [ diff --git a/genes/requirements.txt b/genes/requirements.txt index 4b506747..f894ce14 100644 --- a/genes/requirements.txt +++ b/genes/requirements.txt @@ -6,7 +6,7 @@ # aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.12.15 +aiohttp==3.13.0 # via # aiohttp-cors # genes (setup.py) @@ -14,35 +14,37 @@ aiohttp-cors==0.8.1 # via genes (setup.py) aiosignal==1.4.0 # via aiohttp -attrs==25.3.0 +attrs==25.4.0 # via aiohttp -frozenlist==1.7.0 +frozenlist==1.8.0 # via # aiohttp # aiosignal -grpcio==1.74.0 +grpcio>=1.78.0 # via # genes (setup.py) # grpcio-tools -grpcio-tools==1.74.0 +grpcio-tools>=1.78.0 # via genes (setup.py) -idna==3.10 +idna==3.11 # via yarl -multidict==6.6.4 +multidict==6.7.0 # via # aiohttp # yarl -propcache==0.3.2 +propcache==0.4.1 # via # aiohttp # yarl -protobuf==6.32.0 +protobuf==6.32.1 # via grpcio-tools redis==6.4.0 # via genes (setup.py) +typing-extensions==4.15.0 + # via grpcio uvloop==0.21.0 # via genes (setup.py) -yarl==1.20.1 +yarl==1.22.0 # via aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/macro_synteny_blocks/Dockerfile.test 
b/macro_synteny_blocks/Dockerfile.test new file mode 100644 index 00000000..1605a58a --- /dev/null +++ b/macro_synteny_blocks/Dockerfile.test @@ -0,0 +1,35 @@ +FROM python:3.13.7-slim-trixie + +# install gcc and other build requirements +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY INSTALL ./ +COPY LICENSE ./ +COPY MANIFEST.in ./ +COPY README.md ./ +COPY setup.cfg ./ +COPY setup.py ./ +COPY requirements.txt ./ +COPY requirements-test.txt ./ +COPY macro_synteny_blocks/ ./macro_synteny_blocks/ +COPY proto/ ./proto/ +COPY tests/ ./tests/ +COPY pytest.ini ./ + +# install the package dependencies +RUN pip3 install --no-cache-dir -r requirements.txt + +# install test dependencies +RUN pip3 install --no-cache-dir -r requirements-test.txt + +# install (and implicitly build) the package +RUN pip3 install --no-cache-dir . + +# remove source directory to ensure tests use the installed package +RUN rm -rf macro_synteny_blocks/ + +CMD ["pytest", "-v", "--tb=short"] diff --git a/macro_synteny_blocks/compose.test.yml b/macro_synteny_blocks/compose.test.yml new file mode 100644 index 00000000..993de578 --- /dev/null +++ b/macro_synteny_blocks/compose.test.yml @@ -0,0 +1,42 @@ +# This file overrides services from +# github.com/legumeinfo/gcv-docker-compose compose.yml and compose.dev.yml +# +# Please use the convenience script: +# ./run-tests.sh + +services: + # Override redis to load test data + redis: + volumes: + # Load test data from dump.rdb + # TEST_DATA_DIR is set by run-tests.sh + - ${TEST_DATA_DIR}/tests/data/dump.rdb:/data/dump.rdb:ro + environment: + REDIS_ARGS: "" # Enable loading from dump.rdb + REDISEARCH_ARGS: "MAXSEARCHRESULTS 100000" + + # Add the test container + macro_synteny_blocks_test: + build: + # MACRO_SYNTENY_BLOCKS_DIR is set by run-tests.sh + context: ${MACRO_SYNTENY_BLOCKS_DIR:-.} + dockerfile: Dockerfile.test + depends_on: + redis: + condition: 
service_healthy + chromosome: + condition: service_started + genes: + condition: service_started + pairwise_macro_synteny_blocks: + condition: service_started + environment: + REDIS_HOST: redis + REDIS_PORT: "6379" + CHROMOSOME_ADDR: chromosome:81 + GENES_ADDR: genes:81 + PAIRWISE_ADDR: pairwise_macro_synteny_blocks:81 + volumes: + # Mount test results for coverage reports + - ${MACRO_SYNTENY_BLOCKS_DIR:-.}/htmlcov:/app/htmlcov:rw + profiles: [] # Always run, no profile required diff --git a/macro_synteny_blocks/macro_synteny_blocks/__main__.py b/macro_synteny_blocks/macro_synteny_blocks/__main__.py index d2d38025..baac2545 100755 --- a/macro_synteny_blocks/macro_synteny_blocks/__main__.py +++ b/macro_synteny_blocks/macro_synteny_blocks/__main__.py @@ -215,6 +215,30 @@ def parseArgs(): using the {pairwiseaddr_envvar} environment variable). """, ) + chromosomeaddr_envvar = "CHROMOSOME_ADDR" + parser.add_argument( + "--chromosomeaddr", + action=EnvArg, + envvar=chromosomeaddr_envvar, + type=str, + help=f""" + The address of the chromosome microservice (can also be specified + using the {chromosomeaddr_envvar} environment variable). Optional - enables + ComputeByChromosome endpoint if provided. + """, + ) + genesaddr_envvar = "GENES_ADDR" + parser.add_argument( + "--genesaddr", + action=EnvArg, + envvar=genesaddr_envvar, + type=str, + help=f""" + The address of the genes microservice (can also be specified + using the {genesaddr_envvar} environment variable). Optional - enables + gene position enrichment in ComputeByChromosome endpoint if provided. 
+ """, + ) return parser.parse_args() @@ -276,7 +300,14 @@ def main(): connectToRedis(args.rhost, args.rport, args.rdb, args.rpassword) ) # create the request handler - handler = RequestHandler(redis_connection, args.pairwiseaddr) + handler = RequestHandler( + redis_connection, + args.pairwiseaddr, + chromosome_address=( + args.chromosomeaddr if hasattr(args, "chromosomeaddr") else None + ), + genes_address=args.genesaddr if hasattr(args, "genesaddr") else None, + ) # start the HTTP server if not args.nohttp: loop.create_task(run_http_server(args.hhost, args.hport, handler)) diff --git a/macro_synteny_blocks/macro_synteny_blocks/commands.py b/macro_synteny_blocks/macro_synteny_blocks/commands.py index 61694793..79be7195 100644 --- a/macro_synteny_blocks/macro_synteny_blocks/commands.py +++ b/macro_synteny_blocks/macro_synteny_blocks/commands.py @@ -24,8 +24,8 @@ import os import sys +from importlib.resources import files as resource_files -import pkg_resources import setuptools @@ -63,9 +63,7 @@ def build_package_protos(self): if filename.endswith(".proto"): proto_files.append(os.path.abspath(os.path.join(root, filename))) - well_known_protos_include = pkg_resources.resource_filename( - "grpc_tools", "_proto" - ) + well_known_protos_include = str(resource_files("grpc_tools").joinpath("_proto")) for proto_file in proto_files: command = [ diff --git a/macro_synteny_blocks/macro_synteny_blocks/grpc_client.py b/macro_synteny_blocks/macro_synteny_blocks/grpc_client.py index ad3d3857..93c78d0d 100644 --- a/macro_synteny_blocks/macro_synteny_blocks/grpc_client.py +++ b/macro_synteny_blocks/macro_synteny_blocks/grpc_client.py @@ -22,10 +22,70 @@ pairwisemacrosyntenyblocks_pb2, pairwisemacrosyntenyblocks_pb2_grpc, ) +from chromosome_service.v1 import chromosome_pb2, chromosome_pb2_grpc +from genes_service.v1 import genes_pb2, genes_pb2_grpc # isort: on +async def getChromosome(chromosome_name, address): + """ + Fetch chromosome data from the chromosome microservice. 
+ + Parameters: + chromosome_name (str): Name of the chromosome to fetch. + address (str): Address of the chromosome microservice. + + Returns: + tuple: (gene_families list, gene_names list, chromosome_length), or None if not found. + """ + # fetch channel every time to support dynamic services + channel = aio.insecure_channel(address) + await channel.channel_ready() + stub = chromosome_pb2_grpc.ChromosomeStub(channel) + try: + result = await stub.Get( + chromosome_pb2.ChromosomeGetRequest( + name=chromosome_name, + ) + ) + return ( + list(result.chromosome.track.families), + list(result.chromosome.track.genes), + result.chromosome.length, + ) + except Exception as e: + logging.error(e) + return None + + +async def getGenes(gene_names, address): + """ + Fetch gene data from the genes microservice. + + Parameters: + gene_names (list): List of gene names to fetch. + address (str): Address of the genes microservice. + + Returns: + list: Gene objects with fmin/fmax positions, or None if error. 
+ """ + # fetch channel every time to support dynamic services + channel = aio.insecure_channel(address) + await channel.channel_ready() + stub = genes_pb2_grpc.GenesStub(channel) + try: + result = await stub.Get( + genes_pb2.GenesGetRequest( + names=gene_names, + ) + ) + return result.genes + except Exception as e: + logging.error(e) + return None + + async def computePairwiseMacroSyntenyBlocks( chromosome, target, @@ -36,6 +96,8 @@ async def computePairwiseMacroSyntenyBlocks( chromosome_genes, chromosome_length, address, + identity=None, + correspondences=None, ): # fetch channel every time to support dynamic services channel = aio.insecure_channel(address) @@ -52,6 +114,8 @@ async def computePairwiseMacroSyntenyBlocks( optionalMetrics=metrics, chromosomeGenes=chromosome_genes, chromosomeLength=chromosome_length, + identity=identity, + correspondences=correspondences, ) ) return result.blocks diff --git a/macro_synteny_blocks/macro_synteny_blocks/grpc_server.py b/macro_synteny_blocks/macro_synteny_blocks/grpc_server.py index b8cb9253..d4c94f2f 100644 --- a/macro_synteny_blocks/macro_synteny_blocks/grpc_server.py +++ b/macro_synteny_blocks/macro_synteny_blocks/grpc_server.py @@ -45,6 +45,8 @@ async def _compute(self, request, context): metrics = request.optionalMetrics or None chromosome_genes = request.chromosomeGenes or None chromosome_length = request.chromosomeLength or None + identity = request.identity or None + correspondences = request.correspondences or None try: ( chromosome, @@ -55,6 +57,8 @@ async def _compute(self, request, context): metrics, chromosome_genes, chromosome_length, + identity, + correspondences, ) = self.handler.parseArguments( chromosome, matched, @@ -64,6 +68,8 @@ async def _compute(self, request, context): metrics, chromosome_genes, chromosome_length, + identity, + correspondences, ) except Exception: # raise a gRPC INVALID ARGUMENT error @@ -80,6 +86,9 @@ async def _compute(self, request, context): metrics, chromosome_genes, 
chromosome_length, + grpc_decode=False, + identity=identity, + correspondences=correspondences, ) return macrosyntenyblocks_pb2.MacroSyntenyBlocksComputeReply(blocks=blocks) @@ -99,6 +108,90 @@ async def Compute(self, request, context): # return a gRPC INTERNAL error await context.abort(grpc.StatusCode.INTERNAL, "Internal server error") + # the method that handles ComputeByChromosome requests + async def _computeByChromosome(self, request, context): + # required parameters + chromosome_name = request.chromosomeName + matched = request.matched + intermediate = request.intermediate + # optional parameters + mask = request.mask or None + targets = request.targets or None + metrics = request.optionalMetrics or None + chromosome_genes = request.chromosomeGenes or None + chromosome_length = request.chromosomeLength or None + identity = request.identity or None + correspondences = request.correspondences or None + try: + ( + chromosome_name, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + identity, + correspondences, + ) = self.handler.parseArguments( + [chromosome_name], # Wrap in list to reuse parseArguments validation + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + identity, + correspondences, + ) + # Extract the chromosome name back from the list + chromosome_name = chromosome_name[0] + except Exception: + # raise a gRPC INVALID ARGUMENT error + await context.abort( + grpc.StatusCode.INVALID_ARGUMENT, + "Required arguments are missing or given arguments have invalid values", + ) + try: + blocks = await self.handler.processWithChromosomeName( + chromosome_name, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + grpc_decode=False, + identity=identity, + correspondences=correspondences, + ) + except ValueError as e: + # Chromosome not found or address not configured + await context.abort( + 
grpc.StatusCode.FAILED_PRECONDITION, + str(e), + ) + return macrosyntenyblocks_pb2.MacroSyntenyBlocksComputeReply(blocks=blocks) + + # implements the ComputeByChromosome API + async def ComputeByChromosome(self, request, context): + # subvert the gRPC exception handler via a try/except block + try: + return await self._computeByChromosome(request, context) + # let errors we raised go by + except aio.AbortError as e: + raise e + # raise an internal error to prevent non-gRPC info from being sent to users + except Exception as e: + # raise the exception after aborting so it gets logged + # NOTE: gRPC docs says abort should raise an error but it doesn't... + context.add_done_callback(self._exceptionCallbackFactory(e)) + # return a gRPC INTERNAL error + await context.abort(grpc.StatusCode.INTERNAL, "Internal server error") + async def run_grpc_server(host, port, handler): server = aio.server() diff --git a/macro_synteny_blocks/macro_synteny_blocks/http_server.py b/macro_synteny_blocks/macro_synteny_blocks/http_server.py index a378d364..13711e0b 100644 --- a/macro_synteny_blocks/macro_synteny_blocks/http_server.py +++ b/macro_synteny_blocks/macro_synteny_blocks/http_server.py @@ -56,6 +56,72 @@ async def http_post_handler(request): return json +async def http_post_by_chromosome_handler(request): + # parse the chromosome name and parameters from the POST data + data = await request.json() + # required parameters + chromosome_name = data.get("chromosomeName") + matched = data.get("matched") + intermediate = data.get("intermediate") + # optional parameters + mask = data.get("mask", None) + targets = data.get("targets", None) + metrics = data.get("optionalMetrics", None) + chromosome_genes = data.get("chromosome_genes", None) + chromosome_length = data.get("chromosome_length", None) + handler = request.app["handler"] + + # Check if chromosome address is configured + if handler.chromosome_address is None: + return web.HTTPServiceUnavailable( + text="ComputeByChromosome 
endpoint is not enabled. Chromosome address not configured." + ) + + try: + ( + chromosome_name_list, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + ) = handler.parseArguments( + [chromosome_name], + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + ) + chromosome_name = chromosome_name_list[0] + except Exception: + return web.HTTPBadRequest( + text="Required arguments are missing or have invalid values" + ) + + try: + blocks = await handler.processWithChromosomeName( + chromosome_name, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + grpc_decode=True, + ) + except ValueError as e: + return web.HTTPBadRequest(text=str(e)) + + json = web.json_response({"blocks": blocks}) + return json + + async def run_http_server(host, port, handler): # make the app app = web.Application() @@ -73,6 +139,11 @@ async def run_http_server(host, port, handler): ) route = app.router.add_post("/", http_post_handler) cors.add(route) + # Add the new chromosome-based endpoint + route_by_chromosome = app.router.add_post( + "/by-chromosome", http_post_by_chromosome_handler + ) + cors.add(route_by_chromosome) # run the app runner = web.AppRunner(app) await runner.setup() diff --git a/macro_synteny_blocks/macro_synteny_blocks/request_handler.py b/macro_synteny_blocks/macro_synteny_blocks/request_handler.py index 1330fcfc..aed77157 100644 --- a/macro_synteny_blocks/macro_synteny_blocks/request_handler.py +++ b/macro_synteny_blocks/macro_synteny_blocks/request_handler.py @@ -1,13 +1,21 @@ # Python import asyncio +import logging from collections import defaultdict +from redis.commands.search import reducers + # dependencies +from redis.commands.search.aggregation import AggregateRequest from redis.commands.search.query import Query # module from macro_synteny_blocks.aioredisearch import CustomAsyncSearch -from macro_synteny_blocks.grpc_client import 
computePairwiseMacroSyntenyBlocks +from macro_synteny_blocks.grpc_client import ( + computePairwiseMacroSyntenyBlocks, + getChromosome, + getGenes, +) class RequestHandler: @@ -15,10 +23,14 @@ def __init__( self, redis_connection, pairwise_address, + chromosome_address=None, + genes_address=None, breakpoint_characters=",.<>{}[]\"':;!@#$%^&*()-+=~", ): self.redis_connection = redis_connection self.pairwise_address = pairwise_address + self.chromosome_address = chromosome_address + self.genes_address = genes_address self.breakpoint_characters = set(breakpoint_characters) def parseArguments( @@ -31,6 +43,8 @@ def parseArguments( metrics, chromosome_genes, chromosome_length, + identity=None, + correspondences=None, ): iter(chromosome) # TypeError if not iterable if targets is None: @@ -65,6 +79,12 @@ def parseArguments( mask = int(mask) if mask <= 0: raise ValueError("mask must be positive") + # validate identity parameter + if identity is not None and identity not in ("levenshtein", "jaccard"): + raise ValueError('identity must be "levenshtein" or "jaccard"') + # validate correspondences parameter + if correspondences is not None and not isinstance(correspondences, bool): + raise ValueError("correspondences must be a boolean") return ( chromosome, matched, @@ -74,15 +94,17 @@ def parseArguments( metrics, chromosome_genes, chromosome_length, + identity, + correspondences, ) def _cleanTag(self, tag): - cleaned_tag = "" + parts = [] for c in tag: if c in self.breakpoint_characters: - cleaned_tag += "\\" - cleaned_tag += c - return cleaned_tag + parts.append("\\") + parts.append(c) + return "".join(parts) def _grpcBlockToDictBlock(self, grpc_block): dict_block = { @@ -94,58 +116,94 @@ def _grpcBlockToDictBlock(self, grpc_block): } if grpc_block.optionalMetrics: dict_block["optionalMetrics"] = list(grpc_block.optionalMetrics) + if grpc_block.HasField("identity"): + dict_block["identity"] = grpc_block.identity + if grpc_block.correspondences: + dict_block["correspondences"] 
= [ + { + "query_index": c.query_index, + "target_index": c.target_index, + "target_fmin": c.target_fmin, + "target_fmax": c.target_fmax, + } + for c in grpc_block.correspondences + ] return dict_block async def _getTargets(self, targets, chromosome, matched, intermediate): + BATCH_SIZE = 100 + # use a pipeline to reduce the number of calls to database pipeline = self.redis_connection.pipeline() - gene_index = CustomAsyncSearch(pipeline, index_name="geneIdx") # get genes for each family and bin them by chromosome families = set(chromosome) families.discard("") - chromosome_match_indices = defaultdict(list) - # count how many genes are in each family - query_strings = [] - count_queries = [] - for family in families: - cleaned_family = self._cleanTag(family) - query_string = "(@family:{" + cleaned_family + "})" - # limit the genes to the target chromosomes - if targets: - cleaned_targets = map(self._cleanTag, targets) - query_string += "(@chromosome:{" + "|".join(cleaned_targets) + "})" - query_strings.append(query_string) - # count how many genes are in the family - query = Query(query_string).verbatim().paging(0, 0) - count_queries.append(query) - await gene_index.search(query) # returns the pipeline, not a Result! 
- count_results = await pipeline.execute() - - # get the genes for each family - gene_queries = [] - for family, query_string, query, res in zip( - families, query_strings, count_queries, count_results - ): - result = gene_index.search_result(query, res) - num_genes = result.total - # get the genes - query = ( - Query(query_string) - .verbatim() - .return_fields("chromosome", "index") - .paging(0, num_genes) + # pre-compute cleaned targets string once + targets_query_part = "" + if targets: + cleaned_targets = [self._cleanTag(target) for target in targets] + targets_query_part = f"(@chromosome:{{{'|'.join(cleaned_targets)}}})" + + # clean all families once + cleaned_families = [self._cleanTag(family) for family in families] + + # split families into batches + family_batches = [] + for i in range(0, len(cleaned_families), BATCH_SIZE): + family_batches.append(cleaned_families[i : i + BATCH_SIZE]) + + # use FT.AGGREGATE to group genes by chromosome and collect indices + for batch in family_batches: + # combine all families in this batch with pipe (OR) operator + query_string = f"(@family:{{{'|'.join(batch)}}}){targets_query_part}" + + request = AggregateRequest(query_string).group_by( + "@chromosome", + reducers.tolist("@index").alias("indices"), + reducers.count().alias("gene_count"), ) - gene_queries.append(query) - await gene_index.search(query) # returns the pipeline, not a Result! 
- gene_results = await pipeline.execute() - # bin the genes by chromosome - for query, res in zip(gene_queries, gene_results): - result = gene_index.search_result(query, res) - for d in result.docs: - chromosome_match_indices[d.chromosome].append(int(d.index)) + pipeline.execute_command("FT.AGGREGATE", "geneIdx", *request.build_args()) + + aggregate_results = await pipeline.execute() + + if not aggregate_results or len(aggregate_results) == 0: + logging.warning("No results returned from gene aggregation pipeline") + return [] + + # bin the genes by chromosome from aggregated results + chromosome_match_indices = defaultdict(list) + + for result in aggregate_results: + if not result or len(result) < 2: + continue + + # skip the first element (total count of rows) then process each row + for row_idx in range(1, len(result)): + row = result[row_idx] + + # rows look like this, indices and gene_count are actually numbers + # ['chromosome', , 'indices', , 'gene_count', ] + chrom_name = None + indices_value = None + + # iterate through field-value pairs + for field_idx in range(0, len(row), 2): + if field_idx + 1 >= len(row): + break + field_name = row[field_idx] + field_value = row[field_idx + 1] + + if field_name == "chromosome": + chrom_name = field_value + elif field_name == "indices": + indices_value = field_value + + indices = [int(idx) for idx in indices_value] + + chromosome_match_indices[chrom_name].extend(indices) # sort index lists and filter by match and intermediate parameters filtered_targets = [] @@ -199,6 +257,8 @@ async def _computePairwiseBlocks( chromosome_length, chromosome_index, grpc_decode, + identity=None, + correspondences=None, ): # compute the blocks for the target chromosome blocks = await computePairwiseMacroSyntenyBlocks( @@ -211,6 +271,8 @@ async def _computePairwiseBlocks( chromosome_genes, chromosome_length, self.pairwise_address, + identity, + correspondences, ) if not blocks: # true for None or [] return None @@ -239,6 +301,8 @@ async def 
process( chromosome_genes, chromosome_length, grpc_decode=False, + identity=None, + correspondences=None, ): # connect to the index chromosome_index = CustomAsyncSearch( @@ -262,6 +326,8 @@ async def process( chromosome_length, chromosome_index, grpc_decode, + identity, + correspondences, ) for name in filtered_targets ] @@ -270,3 +336,204 @@ async def process( filtered_target_blocks = list(filter(lambda b: b is not None, target_blocks)) return filtered_target_blocks + + async def _enrichBlocksWithGeneInfo(self, blocks, query_gene_names): + """ + Enrich blocks with query gene position information. + + Parameters: + blocks: List of Blocks objects from process() + query_gene_names: List of query chromosome gene names + + Returns: + Enriched blocks with gene names and positions filled in + """ + if self.genes_address is None: + return blocks # Return blocks unchanged if genes address not configured + + # Collect all unique gene names needed (from block endpoints and correspondences) + gene_names_to_fetch = set() + for blocks_obj in blocks: + for block in blocks_obj["blocks"]: + # Handle both dict and gRPC object formats + is_dict = isinstance(block, dict) + gene_idx = block["i"] if is_dict else block.i + if gene_idx < len(query_gene_names): + gene_names_to_fetch.add(query_gene_names[gene_idx]) + gene_idx = block["j"] if is_dict else block.j + if gene_idx < len(query_gene_names): + gene_names_to_fetch.add(query_gene_names[gene_idx]) + # Also collect gene names from correspondences + correspondences = ( + block.get("correspondences", []) + if is_dict + else getattr(block, "correspondences", []) + ) + for corr in correspondences: + corr_query_idx = ( + corr["query_index"] + if isinstance(corr, dict) + else corr.query_index + ) + if corr_query_idx < len(query_gene_names): + gene_names_to_fetch.add(query_gene_names[corr_query_idx]) + + if not gene_names_to_fetch: + return blocks + + # Fetch all gene info in one call + genes = await getGenes(list(gene_names_to_fetch), 
self.genes_address) + if genes is None: + return blocks + + # Create lookup map + gene_map = {} + for gene in genes: + gene_map[gene.name] = gene + + # Enrich each block + for blocks_obj in blocks: + for block in blocks_obj["blocks"]: + # Handle both dict and gRPC object formats + is_dict = isinstance(block, dict) + gene_idx = block["i"] if is_dict else block.i + if gene_idx < len(query_gene_names): + gene_name = query_gene_names[gene_idx] + if gene_name in gene_map: + gene = gene_map[gene_name] + if is_dict: + block["queryGeneName"] = gene_name + block["queryGeneFmin"] = gene.fmin + block["queryGeneFmax"] = gene.fmax + else: + block.queryGeneName = gene_name + block.queryGeneFmin = gene.fmin + block.queryGeneFmax = gene.fmax + gene_idx = block["j"] if is_dict else block.j + if gene_idx < len(query_gene_names): + gene_name = query_gene_names[gene_idx] + if gene_name in gene_map: + gene = gene_map[gene_name] + if is_dict: + block["queryGeneFmin"] = min( + gene.fmin, block["queryGeneFmin"] + ) + block["queryGeneFmax"] = max( + gene.fmax, block["queryGeneFmax"] + ) + else: + block.queryGeneFmin = min(gene.fmin, block.queryGeneFmin) + block.queryGeneFmax = max(gene.fmax, block.queryGeneFmax) + + # Enrich correspondences with query gene coordinates + correspondences = ( + block.get("correspondences", []) + if is_dict + else getattr(block, "correspondences", []) + ) + for corr in correspondences: + corr_is_dict = isinstance(corr, dict) + corr_query_idx = ( + corr["query_index"] if corr_is_dict else corr.query_index + ) + if corr_query_idx < len(query_gene_names): + gene_name = query_gene_names[corr_query_idx] + if gene_name in gene_map: + gene = gene_map[gene_name] + if corr_is_dict: + corr["query_fmin"] = gene.fmin + corr["query_fmax"] = gene.fmax + else: + corr.query_fmin = gene.fmin + corr.query_fmax = gene.fmax + + return blocks + + async def _addChromosomeLengths(self, blocks): + """ + Add target chromosome lengths to Blocks objects. 
+ + Parameters: + blocks: List of Blocks objects from process() + + Returns: + Blocks with chromosomeLength field filled in + """ + # Connect to the chromosome index + chromosome_index = CustomAsyncSearch( + self.redis_connection, index_name="chromosomeIdx" + ) + + for blocks_obj in blocks: + # Fetch the chromosome doc to get length + doc = await chromosome_index.load_document( + f"chromosome:{blocks_obj['chromosome']}" + ) + blocks_obj["chromosomeLength"] = int(doc.length) + + return blocks + + async def processWithChromosomeName( + self, + chromosome_name, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + grpc_decode=False, + identity=None, + correspondences=None, + ): + """ + Process macro synteny blocks using a chromosome name instead of gene families. + This method fetches the chromosome data from the chromosome microservice first, + and enriches the returned blocks with gene position information. + + Parameters: + chromosome_name (str): Name of the query chromosome. + Other parameters: Same as process() method. + + Returns: + Same as process() method, but with enriched blocks containing: + - queryGeneName, queryGeneFmin, queryGeneFmax (if genes_address configured) + - chromosomeLength in Blocks objects (target chromosome lengths) + - correspondences with query_fmin/query_fmax (if correspondences=True) + """ + if self.chromosome_address is None: + raise ValueError( + "Chromosome address is not configured. Cannot use ComputeByChromosome endpoint." 
+ ) + + # Fetch chromosome data from the chromosome microservice + chromosome_data = await getChromosome(chromosome_name, self.chromosome_address) + + if chromosome_data is None: + raise ValueError(f"Chromosome '{chromosome_name}' not found") + + chromosome_families, query_gene_names, query_chromosome_length = chromosome_data + + # Use the existing process method with the fetched gene families + blocks = await self.process( + chromosome_families, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + grpc_decode, + identity, + correspondences, + ) + + # Enrich blocks with query gene information + blocks = await self._enrichBlocksWithGeneInfo(blocks, query_gene_names) + + # Add target chromosome lengths + blocks = await self._addChromosomeLengths(blocks) + + return blocks diff --git a/macro_synteny_blocks/proto/block/v1/block.proto b/macro_synteny_blocks/proto/block/v1/block.proto index 621bb867..27f03bee 100644 --- a/macro_synteny_blocks/proto/block/v1/block.proto +++ b/macro_synteny_blocks/proto/block/v1/block.proto @@ -8,6 +8,16 @@ message Blocks { string genus = 2; string species = 3; repeated Block blocks = 4; + optional uint32 chromosomeLength = 5; // Length of target chromosome in base pairs +} + +message Correspondence { + uint32 query_index = 1; + uint32 target_index = 2; + uint32 target_fmin = 3; // Target gene start position (bp) + uint32 target_fmax = 4; // Target gene end position (bp) + optional uint32 query_fmin = 5; // Query gene start position (bp), if enriched + optional uint32 query_fmax = 6; // Query gene end position (bp), if enriched } message Block { @@ -17,4 +27,9 @@ message Block { uint32 fmax = 4; string orientation = 5; repeated float optionalMetrics = 6; + optional string queryGeneName = 7; // Name of query gene at index i + optional uint32 queryGeneFmin = 8; // Query gene start position + optional uint32 queryGeneFmax = 9; // Query gene end position + optional float identity = 10; // Identity 
score (0.0-1.0) if requested + repeated Correspondence correspondences = 11; // Gene pair correspondences within the block } diff --git a/macro_synteny_blocks/proto/chromosome_service/v1/chromosome.proto b/macro_synteny_blocks/proto/chromosome_service/v1/chromosome.proto new file mode 100644 index 00000000..67abc3a2 --- /dev/null +++ b/macro_synteny_blocks/proto/chromosome_service/v1/chromosome.proto @@ -0,0 +1,20 @@ +syntax = "proto3"; + +import "track/v1/track.proto"; + +package legumeinfo.microservices.chromosome_service.v1; + + +service Chromosome { + rpc Get (ChromosomeGetRequest) returns (ChromosomeGetReply) {} +} + + +message ChromosomeGetRequest { + string name = 1; +} + + +message ChromosomeGetReply { + legumeinfo.microservices.track.v1.Chromosome chromosome = 1; +} diff --git a/macro_synteny_blocks/proto/gene/v1/gene.proto b/macro_synteny_blocks/proto/gene/v1/gene.proto new file mode 100644 index 00000000..ad281da9 --- /dev/null +++ b/macro_synteny_blocks/proto/gene/v1/gene.proto @@ -0,0 +1,13 @@ +syntax = "proto3"; + +package legumeinfo.microservices.gene.v1; + + +message Gene { + string name = 1; + uint32 fmin = 2; + uint32 fmax = 3; + int32 strand = 4; + string family = 5; + string chromosome = 6; +} diff --git a/macro_synteny_blocks/proto/genes_service/v1/genes.proto b/macro_synteny_blocks/proto/genes_service/v1/genes.proto new file mode 100644 index 00000000..a1145025 --- /dev/null +++ b/macro_synteny_blocks/proto/genes_service/v1/genes.proto @@ -0,0 +1,20 @@ +syntax = "proto3"; + +import "gene/v1/gene.proto"; + +package legumeinfo.microservices.genes_service.v1; + + +service Genes { + rpc Get (GenesGetRequest) returns (GenesGetReply) {} +} + + +message GenesGetRequest { + repeated string names = 1; +} + + +message GenesGetReply { + repeated legumeinfo.microservices.gene.v1.Gene genes = 1; +} diff --git a/macro_synteny_blocks/proto/macrosyntenyblocks_service/v1/macrosyntenyblocks.proto 
b/macro_synteny_blocks/proto/macrosyntenyblocks_service/v1/macrosyntenyblocks.proto index 051b128e..000e2825 100644 --- a/macro_synteny_blocks/proto/macrosyntenyblocks_service/v1/macrosyntenyblocks.proto +++ b/macro_synteny_blocks/proto/macrosyntenyblocks_service/v1/macrosyntenyblocks.proto @@ -7,6 +7,7 @@ package legumeinfo.microservices.macrosyntenyblocks_service.v1; service MacroSyntenyBlocks { rpc Compute (MacroSyntenyBlocksComputeRequest) returns (MacroSyntenyBlocksComputeReply) {} + rpc ComputeByChromosome (MacroSyntenyBlocksComputeByChromosomeRequest) returns (MacroSyntenyBlocksComputeReply) {} } @@ -19,6 +20,22 @@ message MacroSyntenyBlocksComputeRequest { repeated string optionalMetrics = 6; optional uint32 chromosomeGenes = 7; optional uint32 chromosomeLength = 8; + optional string identity = 9; // "levenshtein" or "jaccard" - computes identity metric + optional bool correspondences = 10; // If true, include gene pair correspondences within blocks +} + + +message MacroSyntenyBlocksComputeByChromosomeRequest { + string chromosomeName = 1; + uint32 matched = 2; + uint32 intermediate = 3; + optional uint32 mask = 4; + repeated string targets = 5; + repeated string optionalMetrics = 6; + optional uint32 chromosomeGenes = 7; + optional uint32 chromosomeLength = 8; + optional string identity = 9; // "levenshtein" or "jaccard" - computes identity metric + optional bool correspondences = 10; // If true, include gene pair correspondences within blocks } diff --git a/macro_synteny_blocks/proto/pairwisemacrosyntenyblocks_service/v1/pairwisemacrosyntenyblocks.proto b/macro_synteny_blocks/proto/pairwisemacrosyntenyblocks_service/v1/pairwisemacrosyntenyblocks.proto index 5d12110d..173de58e 100644 --- a/macro_synteny_blocks/proto/pairwisemacrosyntenyblocks_service/v1/pairwisemacrosyntenyblocks.proto +++ b/macro_synteny_blocks/proto/pairwisemacrosyntenyblocks_service/v1/pairwisemacrosyntenyblocks.proto @@ -19,6 +19,8 @@ message PairwiseMacroSyntenyBlocksComputeRequest { 
repeated string optionalMetrics = 6; optional uint32 chromosomeGenes = 7; optional uint32 chromosomeLength = 8; + optional string identity = 9; // "levenshtein" or "jaccard" - computes identity metric + optional bool correspondences = 10; // If true, include gene pair correspondences within blocks } diff --git a/macro_synteny_blocks/proto/track/v1/track.proto b/macro_synteny_blocks/proto/track/v1/track.proto new file mode 100644 index 00000000..7a09cb1c --- /dev/null +++ b/macro_synteny_blocks/proto/track/v1/track.proto @@ -0,0 +1,23 @@ +syntax = "proto3"; + +package legumeinfo.microservices.track.v1; + + +message Track { + string genus = 2; + string species = 3; + repeated string genes = 4; + repeated string families = 5; +} + + +message Chromosome { + uint32 length = 1; + Track track = 2; +} + + +message MicroTrack { + string name = 1; + Track track = 2; +} diff --git a/macro_synteny_blocks/pytest.ini b/macro_synteny_blocks/pytest.ini new file mode 100644 index 00000000..163901fd --- /dev/null +++ b/macro_synteny_blocks/pytest.ini @@ -0,0 +1,18 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +asyncio_mode = auto +asyncio_default_fixture_loop_scope = function +markers = + unit: Unit tests (fast, no external dependencies) + integration: Integration tests (slower, may use Docker) + slow: Slow tests (>1s) +addopts = + -v + --tb=short + --strict-markers + --cov=macro_synteny_blocks + --cov-report=term-missing + --cov-report=html diff --git a/macro_synteny_blocks/requirements-test.txt b/macro_synteny_blocks/requirements-test.txt new file mode 100644 index 00000000..b04ed5ed --- /dev/null +++ b/macro_synteny_blocks/requirements-test.txt @@ -0,0 +1,5 @@ +pytest==8.3.5 +pytest-asyncio==0.24.0 +pytest-cov==6.0.0 +pytest-mock==3.14.0 +fakeredis==2.27.0 diff --git a/macro_synteny_blocks/requirements.txt b/macro_synteny_blocks/requirements.txt index 914dec2b..93a3bb74 100644 --- 
a/macro_synteny_blocks/requirements.txt +++ b/macro_synteny_blocks/requirements.txt @@ -6,7 +6,7 @@ # aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.12.15 +aiohttp==3.13.0 # via # aiohttp-cors # macro_synteny_blocks (setup.py) @@ -14,35 +14,37 @@ aiohttp-cors==0.8.1 # via macro_synteny_blocks (setup.py) aiosignal==1.4.0 # via aiohttp -attrs==25.3.0 +attrs==25.4.0 # via aiohttp -frozenlist==1.7.0 +frozenlist==1.8.0 # via # aiohttp # aiosignal -grpcio==1.74.0 +grpcio>=1.78.0 # via # grpcio-tools # macro_synteny_blocks (setup.py) -grpcio-tools==1.74.0 +grpcio-tools>=1.78.0 # via macro_synteny_blocks (setup.py) -idna==3.10 +idna==3.11 # via yarl -multidict==6.6.4 +multidict==6.7.0 # via # aiohttp # yarl -propcache==0.3.2 +propcache==0.4.1 # via # aiohttp # yarl -protobuf==6.32.0 +protobuf==6.32.1 # via grpcio-tools redis==6.4.0 # via macro_synteny_blocks (setup.py) +typing-extensions==4.15.0 + # via grpcio uvloop==0.21.0 # via macro_synteny_blocks (setup.py) -yarl==1.20.1 +yarl==1.22.0 # via aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/macro_synteny_blocks/run-tests.sh b/macro_synteny_blocks/run-tests.sh new file mode 100755 index 00000000..13fa670f --- /dev/null +++ b/macro_synteny_blocks/run-tests.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# +# This script uses the base compose configuration from gcv-docker-compose and overlays +# test-specific configuration. +# +# The GCV_DOCKER_COMPOSE environment variable should point to the gcv-docker-compose +# directory (defaults to ../ - the parent directory). 
+# +# Usage: +# ./run-tests.sh + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Compose file paths +# Use gcv-docker-compose directory for base configuration +GCV_DOCKER_COMPOSE="${GCV_DOCKER_COMPOSE:-../}" +COMPOSE_BASE="$GCV_DOCKER_COMPOSE/compose.yml" +COMPOSE_DEV="$GCV_DOCKER_COMPOSE/compose.dev.yml" +COMPOSE_TEST="compose.test.yml" + +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Running macro_synteny_blocks tests${NC}" +echo -e "${GREEN}========================================${NC}" +echo "" + +# Verify compose files exist +if [ ! -f "$COMPOSE_BASE" ]; then + echo -e "${RED}Error: $COMPOSE_BASE not found${NC}" + echo -e "${YELLOW}Set GCV_DOCKER_COMPOSE environment variable to the gcv-docker-compose directory${NC}" + echo -e "${YELLOW}Default: ../ (parent directory)${NC}" + exit 1 +fi + +if [ ! -f "$COMPOSE_DEV" ]; then + echo -e "${RED}Error: $COMPOSE_DEV not found${NC}" + exit 1 +fi + +echo -e "${YELLOW}Using compose files from: ${GCV_DOCKER_COMPOSE}${NC}" + +# Save current directory +ORIGINAL_DIR=$(pwd) + +# Export paths for use in compose.test.yml +export MACRO_SYNTENY_BLOCKS_DIR="$ORIGINAL_DIR" +export TEST_DATA_DIR="$(dirname "$ORIGINAL_DIR")" # Parent of macro_synteny_blocks + +# Change to GCV docker compose directory for running compose +cd "$GCV_DOCKER_COMPOSE" + +# Make compose.test.yml path absolute +COMPOSE_TEST_ABS="$ORIGINAL_DIR/compose.test.yml" + +# Clean up any existing containers +echo -e "${YELLOW}Cleaning up existing containers...${NC}" +docker compose -f "$COMPOSE_BASE" -f "$COMPOSE_DEV" -f "$COMPOSE_TEST_ABS" --profile services down -v 2>/dev/null || true + +# Build and run tests +echo -e "${YELLOW}Building and starting services...${NC}" +docker compose -f "$COMPOSE_BASE" -f "$COMPOSE_DEV" -f "$COMPOSE_TEST_ABS" --profile services up --build --abort-on-container-exit --exit-code-from macro_synteny_blocks_test + +# Capture exit code 
+TEST_EXIT_CODE=$? + +# Clean up +echo "" +echo -e "${YELLOW}Cleaning up...${NC}" +docker compose -f "$COMPOSE_BASE" -f "$COMPOSE_DEV" -f "$COMPOSE_TEST_ABS" --profile services down -v + +# Return to original directory +cd "$ORIGINAL_DIR" + +# Report results +echo "" +echo -e "${GREEN}========================================${NC}" +if [ $TEST_EXIT_CODE -eq 0 ]; then + echo -e "${GREEN}✓ Tests passed!${NC}" +else + echo -e "${RED}✗ Tests failed with exit code $TEST_EXIT_CODE${NC}" +fi +echo -e "${GREEN}========================================${NC}" +echo "" +echo -e "Coverage report available in: ${YELLOW}./htmlcov/index.html${NC}" + +exit $TEST_EXIT_CODE diff --git a/macro_synteny_blocks/tests/conftest.py b/macro_synteny_blocks/tests/conftest.py new file mode 100644 index 00000000..d41b1ca1 --- /dev/null +++ b/macro_synteny_blocks/tests/conftest.py @@ -0,0 +1,228 @@ +import os +from collections import namedtuple +from unittest.mock import AsyncMock, MagicMock + +import fakeredis.aioredis +import pytest +import redis.asyncio as aioredis + + +@pytest.fixture +async def redis_connection(): + """Real Redis connection for integration tests (from compose.test.yml).""" + redis_host = os.getenv("REDIS_HOST", "redis") + redis_port = int(os.getenv("REDIS_PORT", "6379")) + + redis_conn = await aioredis.from_url( + f"redis://{redis_host}:{redis_port}", decode_responses=True + ) + + yield redis_conn + + await redis_conn.aclose() + + +@pytest.fixture +async def fakeredis_connection(): + """In-memory Redis for unit tests.""" + redis = fakeredis.aioredis.FakeRedis(decode_responses=True) + yield redis + await redis.aclose() + + +@pytest.fixture +async def redis_with_gene_index(fakeredis_connection): + """Fixture providing fakeredis with gene index data for batching tests.""" + redis = fakeredis_connection + + # Create gene documents in the gene index + # Simulate genes for multiple chromosomes with various families + test_data = [ + # chr1 genes + ("gene1_chr1", "fam1", "chr1", 0), 
+ ("gene2_chr1", "fam2", "chr1", 1), + ("gene3_chr1", "fam3", "chr1", 2), + ("gene4_chr1", "fam1", "chr1", 3), # Duplicate family + # chr2 genes + ("gene1_chr2", "fam1", "chr2", 0), + ("gene2_chr2", "fam4", "chr2", 1), + ("gene3_chr2", "fam5", "chr2", 2), + # chr3 genes - more matches + ("gene1_chr3", "fam1", "chr3", 0), + ("gene2_chr3", "fam2", "chr3", 1), + ("gene3_chr3", "fam3", "chr3", 2), + ("gene4_chr3", "fam1", "chr3", 3), + ("gene5_chr3", "fam2", "chr3", 4), + ] + + for gene_name, family, chromosome, index in test_data: + # In fakeredis, we need to manually add documents for search + # This is a simplified version - real implementation would use ft.add + await redis.hset( + f"gene:{gene_name}", + mapping={ + "name": gene_name, + "family": family, + "chromosome": chromosome, + "index": str(index), + }, + ) + + # Create chromosome documents + for chr_id in ["chr1", "chr2", "chr3"]: + await redis.hset( + f"chromosome:{chr_id}", + mapping={ + "name": chr_id, + "genus": "Test", + "species": "species", + "length": "10000", + }, + ) + + yield redis + + +@pytest.fixture +async def redis_with_chromosomes(fakeredis_connection): + """Fixture with chromosome data for enrichment tests.""" + redis = fakeredis_connection + + # Create chromosomes with gene families + chromosomes = { + "chr1": { + "families": ["fam1", "fam2", "fam3", "fam4"], + "genes": ["gene1", "gene2", "gene3", "gene4"], + "length": 10000, + }, + "chr2": { + "families": ["fam1", "fam2", "fam5"], + "genes": ["gene5", "gene6", "gene7"], + "length": 8000, + }, + } + + for chr_id, data in chromosomes.items(): + await redis.hset( + f"chromosome:{chr_id}", + mapping={ + "name": chr_id, + "genus": "Test", + "species": "species", + "length": str(data["length"]), + }, + ) + + yield redis + + +# Mock gRPC objects for testing +Gene = namedtuple("Gene", ["name", "fmin", "fmax"]) + + +@pytest.fixture +def mock_genes_service(): + """Mock genes microservice gRPC client.""" + + async def mock_getGenes(gene_names, 
address): + # Return mock gene objects with positions + # Position is based on the gene number in the name (e.g., "gene1" -> index 0, "gene3" -> index 2) + genes = [] + for name in gene_names: + # Extract the gene number from the name (e.g., "gene1" -> 1, "gene3" -> 3) + gene_number = int(name.replace("gene", "")) + # Calculate position based on gene number (gene1 at index 0, gene2 at index 1, etc.) + gene_index = gene_number - 1 + genes.append( + Gene(name=name, fmin=gene_index * 1000, fmax=gene_index * 1000 + 999) + ) + return genes + + return mock_getGenes + + +@pytest.fixture +def mock_chromosome_service(): + """Mock chromosome microservice gRPC client.""" + + async def mock_getChromosome(chromosome_name, address): + # Return mock chromosome data: (families, gene_names, length) + if chromosome_name == "test_chr": + return ( + ["fam1", "fam2", "fam3", "fam4", "fam5"], + ["gene1", "gene2", "gene3", "gene4", "gene5"], + 10000, + ) + elif chromosome_name == "chr1": + return ( + ["fam1", "fam2", "fam3"], + ["gene1", "gene2", "gene3"], + 8000, + ) + return None + + return mock_getChromosome + + +@pytest.fixture +def mock_pairwise_service(): + """Mock pairwise-macro-synteny-blocks gRPC client.""" + + async def mock_computePairwise( + chromosome, + target, + matched, + intermediate, + mask, + metrics, + chromosome_genes, + chromosome_length, + address, + ): + # Return mock blocks as dicts to allow dynamic attribute assignment + # This simulates gRPC objects that can have attributes added + class MockBlock: + def __init__(self, i, j, fmin, fmax, orientation): + self.i = i + self.j = j + self.fmin = fmin + self.fmax = fmax + self.orientation = orientation + + return [ + MockBlock(i=0, j=2, fmin=0, fmax=2999, orientation="+"), + MockBlock(i=3, j=4, fmin=3000, fmax=4999, orientation="+"), + ] + + return mock_computePairwise + + +@pytest.fixture +def sample_blocks(): + """Sample block data for testing enrichment.""" + # Blocks in dict format (as returned by process()) + 
return [ + { + "chromosome": "chr1", + "genus": "Test", + "species": "species", + "blocks": [ + {"i": 0, "j": 2, "fmin": 0, "fmax": 2999, "orientation": "+"}, + {"i": 3, "j": 4, "fmin": 3000, "fmax": 4999, "orientation": "-"}, + ], + }, + { + "chromosome": "chr2", + "genus": "Test", + "species": "species", + "blocks": [ + {"i": 1, "j": 3, "fmin": 1000, "fmax": 3999, "orientation": "+"}, + ], + }, + ] + + +@pytest.fixture +def sample_query_gene_names(): + """Sample query gene names for enrichment tests.""" + return ["gene1", "gene2", "gene3", "gene4", "gene5"] diff --git a/macro_synteny_blocks/tests/test_request_handler.py b/macro_synteny_blocks/tests/test_request_handler.py new file mode 100644 index 00000000..a6059ad4 --- /dev/null +++ b/macro_synteny_blocks/tests/test_request_handler.py @@ -0,0 +1,564 @@ +from collections import namedtuple +from unittest.mock import patch + +import pytest + +from macro_synteny_blocks.request_handler import RequestHandler + + +@pytest.mark.unit +class TestCleanTag: + """Test RediSearch special character escaping.""" + + def setup_method(self): + self.handler = RequestHandler( + redis_connection=None, pairwise_address="localhost:8081" + ) + + def test_escape_special_characters(self): + """Test that RediSearch breakpoint characters are escaped.""" + tag = "aradu.V14167" + cleaned = self.handler._cleanTag(tag) + + assert cleaned == "aradu\\.V14167" + + def test_multiple_special_characters(self): + """Test escaping of multiple special characters.""" + tag = "test-name.version:1" + cleaned = self.handler._cleanTag(tag) + + # -, ., and : should all be escaped + assert "\\" in cleaned + assert cleaned == "test\\-name\\.version\\:1" + + def test_no_special_characters(self): + """Test that regular strings pass through unchanged.""" + tag = "simplestring" + cleaned = self.handler._cleanTag(tag) + + assert cleaned == "simplestring" + + +@pytest.mark.integration +@pytest.mark.asyncio +class TestGetTargets: + """Test the batching optimization 
in _getTargets().""" + + async def test_batching_with_large_family_set(self, redis_connection): + """Test that >100 families trigger batching logic.""" + # Create a large set of families (>100) to test batching + chromosome = [f"test_batch_fam{i}" for i in range(150)] + + # Populate Redis with test data + # Create genes on chr1 and chr2 with some of the families + for i in range(10): + await redis_connection.hset( + f"gene:test_batch_gene_chr1_{i}", + mapping={ + "name": f"test_batch_gene_chr1_{i}", + "family": f"test_batch_fam{i}", + "chromosome": "test_batch_chr1", + "index": str(i), + }, + ) + + for i in range(5): + await redis_connection.hset( + f"gene:test_batch_gene_chr2_{i}", + mapping={ + "name": f"test_batch_gene_chr2_{i}", + "family": f"test_batch_fam{i + 100}", + "chromosome": "test_batch_chr2", + "index": str(i), + }, + ) + + handler = RequestHandler( + redis_connection=redis_connection, pairwise_address="localhost:8081" + ) + + # Call _getTargets with large chromosome + targets = await handler._getTargets( + targets=[], chromosome=chromosome, matched=2, intermediate=50 + ) + + # Should return the chromosomes that have matches + assert len(targets) >= 0 # At least some targets should be found + + # Clean up test data + for i in range(10): + await redis_connection.delete(f"gene:test_batch_gene_chr1_{i}") + for i in range(5): + await redis_connection.delete(f"gene:test_batch_gene_chr2_{i}") + + async def test_filtering_by_matched(self, redis_connection): + """Test that chromosomes with insufficient matches are filtered.""" + chromosome = [ + "test_match_fam1", + "test_match_fam2", + "test_match_fam3", + "test_match_fam4", + ] + + # chr1 has only 1 match (below matched=3) + await redis_connection.hset( + "gene:test_match_chr1_gene1", + mapping={ + "name": "test_match_chr1_gene1", + "family": "test_match_fam1", + "chromosome": "test_match_chr1", + "index": "0", + }, + ) + + # chr2 has 4 matches (meets matched=3) + for i in range(4): + await 
redis_connection.hset( + f"gene:test_match_chr2_gene{i}", + mapping={ + "name": f"test_match_chr2_gene{i}", + "family": f"test_match_fam{i + 1}", + "chromosome": "test_match_chr2", + "index": str(i), + }, + ) + + handler = RequestHandler( + redis_connection=redis_connection, pairwise_address="localhost:8081" + ) + + targets = await handler._getTargets( + targets=[], + chromosome=chromosome, + matched=3, # Minimum 3 matches required + intermediate=10, + ) + + # Only chr2 should pass the filter + assert "test_match_chr2" in targets + assert "test_match_chr1" not in targets + + # Clean up test data + await redis_connection.delete("gene:test_match_chr1_gene1") + for i in range(4): + await redis_connection.delete(f"gene:test_match_chr2_gene{i}") + + async def test_filtering_by_intermediate(self, redis_connection): + """Test that chromosomes where matches are too sparse are filtered.""" + chromosome = [ + "test_inter_fam1", + "test_inter_fam2", + "test_inter_fam3", + "test_inter_fam4", + ] + + # chr1: matches at indices 0, 1, 2 (close together) + for i in range(3): + await redis_connection.hset( + f"gene:test_inter_chr1_gene{i}", + mapping={ + "name": f"test_inter_chr1_gene{i}", + "family": f"test_inter_fam{i + 1}", + "chromosome": "test_inter_chr1", + "index": str(i), + }, + ) + + # chr2: matches at indices 0, 10, 20 (too far apart with intermediate=5) + for idx, i in enumerate([0, 10, 20]): + await redis_connection.hset( + f"gene:test_inter_chr2_gene{idx}", + mapping={ + "name": f"test_inter_chr2_gene{idx}", + "family": f"test_inter_fam{idx + 1}", + "chromosome": "test_inter_chr2", + "index": str(i), + }, + ) + + handler = RequestHandler( + redis_connection=redis_connection, pairwise_address="localhost:8081" + ) + + targets = await handler._getTargets( + targets=[], + chromosome=chromosome, + matched=3, + intermediate=5, # Max gap of 5 between matches + ) + + # chr1 should pass (matches are close) + assert "test_inter_chr1" in targets + # chr2 should fail (matches too 
far apart) + assert "test_inter_chr2" not in targets + + # Clean up test data + for i in range(3): + await redis_connection.delete(f"gene:test_inter_chr1_gene{i}") + for idx in range(3): + await redis_connection.delete(f"gene:test_inter_chr2_gene{idx}") + + async def test_with_targets_filter(self, redis_connection): + """Test that targets parameter filters chromosomes.""" + chromosome = ["test_target_fam1", "test_target_fam2", "test_target_fam3"] + + # Both chr1 and chr2 have enough matches + for i in range(3): + await redis_connection.hset( + f"gene:test_target_chr1_gene{i}", + mapping={ + "name": f"test_target_chr1_gene{i}", + "family": f"test_target_fam{i + 1}", + "chromosome": "test_target_chr1", + "index": str(i), + }, + ) + await redis_connection.hset( + f"gene:test_target_chr2_gene{i}", + mapping={ + "name": f"test_target_chr2_gene{i}", + "family": f"test_target_fam{i + 1}", + "chromosome": "test_target_chr2", + "index": str(i), + }, + ) + + handler = RequestHandler( + redis_connection=redis_connection, pairwise_address="localhost:8081" + ) + + # Only request chr1 + targets = await handler._getTargets( + targets=["test_target_chr1"], # Filter to only chr1 + chromosome=chromosome, + matched=2, + intermediate=10, + ) + + # Should only return chr1, even though chr2 also meets criteria + assert "test_target_chr1" in targets + assert "test_target_chr2" not in targets + + # Clean up test data + for i in range(3): + await redis_connection.delete(f"gene:test_target_chr1_gene{i}") + await redis_connection.delete(f"gene:test_target_chr2_gene{i}") + + +@pytest.mark.unit +@pytest.mark.asyncio +class TestEnrichBlocksWithGeneInfo: + """Test the new gene enrichment feature.""" + + def setup_method(self): + self.handler = RequestHandler( + redis_connection=None, pairwise_address="localhost:8081" + ) + + async def test_enrichment_with_genes_address( + self, sample_blocks, sample_query_gene_names, mock_genes_service + ): + """Test that blocks are enriched when genes_address 
is configured.""" + self.handler.genes_address = "localhost:8082" + + # Mock getGenes function + with patch( + "macro_synteny_blocks.request_handler.getGenes", new=mock_genes_service + ): + enriched = await self.handler._enrichBlocksWithGeneInfo( + sample_blocks, sample_query_gene_names + ) + + # Check that gene info was added + for blocks_obj in enriched: + for block in blocks_obj["blocks"]: + assert "queryGeneName" in block + assert "queryGeneFmin" in block + assert "queryGeneFmax" in block + + async def test_enrichment_uses_both_terminal_genes( + self, sample_blocks, sample_query_gene_names, mock_genes_service + ): + """Test that enrichment uses both block.i and block.j (recent change).""" + self.handler.genes_address = "localhost:8082" + + genes_fetched = set() + + async def tracking_mock_getGenes(gene_names, address): + genes_fetched.update(gene_names) + return await mock_genes_service(gene_names, address) + + with patch( + "macro_synteny_blocks.request_handler.getGenes", new=tracking_mock_getGenes + ): + await self.handler._enrichBlocksWithGeneInfo( + sample_blocks, sample_query_gene_names + ) + + # Should have fetched genes at both i and j indices + # Block at i=0, j=2 should fetch gene1 (index 0) and gene3 (index 2) + assert "gene1" in genes_fetched + assert "gene3" in genes_fetched + # Block at i=3, j=4 should fetch gene4 and gene5 + assert "gene4" in genes_fetched + assert "gene5" in genes_fetched + + async def test_enrichment_without_genes_address( + self, sample_blocks, sample_query_gene_names + ): + """Test that blocks are unchanged when genes_address is None.""" + self.handler.genes_address = None + + enriched = await self.handler._enrichBlocksWithGeneInfo( + sample_blocks, sample_query_gene_names + ) + + # Blocks should be unchanged + assert enriched == sample_blocks + # No gene info should be added + for blocks_obj in enriched: + for block in blocks_obj["blocks"]: + assert "queryGeneName" not in block + + async def 
test_enrichment_handles_grpc_objects( + self, sample_query_gene_names, mock_genes_service + ): + """Test enrichment works with gRPC Block objects (not just dicts).""" + self.handler.genes_address = "localhost:8082" + + # Create mock gRPC-style blocks using a simple class to simulate protobuf objects + class MockGrpcBlock: + def __init__(self, i, j, fmin, fmax, orientation): + self.i = i + self.j = j + self.fmin = fmin + self.fmax = fmax + self.orientation = orientation + + grpc_blocks = [ + { + "chromosome": "chr1", + "genus": "Test", + "species": "species", + "blocks": [ + MockGrpcBlock(i=0, j=2, fmin=0, fmax=2999, orientation="+"), + ], + } + ] + + with patch( + "macro_synteny_blocks.request_handler.getGenes", new=mock_genes_service + ): + enriched = await self.handler._enrichBlocksWithGeneInfo( + grpc_blocks, sample_query_gene_names + ) + + # Check that enrichment worked with gRPC objects + block = enriched[0]["blocks"][0] + assert hasattr(block, "queryGeneName") + assert hasattr(block, "queryGeneFmin") + assert hasattr(block, "queryGeneFmax") + + async def test_enrichment_with_min_max_calculation( + self, sample_query_gene_names, mock_genes_service + ): + """Test that fmin uses min and fmax uses max when both terminal genes present.""" + self.handler.genes_address = "localhost:8082" + + # Block with i=0, j=2 (spans from gene1 at index 0 to gene3 at index 2) + blocks = [ + { + "chromosome": "chr1", + "genus": "Test", + "species": "species", + "blocks": [ + {"i": 0, "j": 2, "fmin": 0, "fmax": 2999, "orientation": "+"}, + ], + } + ] + + with patch( + "macro_synteny_blocks.request_handler.getGenes", new=mock_genes_service + ): + enriched = await self.handler._enrichBlocksWithGeneInfo( + blocks, sample_query_gene_names + ) + + block = enriched[0]["blocks"][0] + # gene1 (i=0): fmin=0, fmax=999 (from mock: gene_index=0, so 0*1000 to 0*1000+999) + # gene3 (j=2): fmin=2000, fmax=2999 (from mock: gene_index=2, so 2*1000 to 2*1000+999) + # queryGeneFmin should be min(0, 
2000) = 0 + # queryGeneFmax should be max(999, 2999) = 2999 + assert block["queryGeneFmin"] == 0 + assert block["queryGeneFmax"] == 2999 + + +@pytest.mark.unit +@pytest.mark.asyncio +class TestAddChromosomeLengths: + """Test chromosome length enrichment.""" + + async def test_add_chromosome_lengths(self, redis_with_chromosomes, sample_blocks): + """Test that chromosome lengths are added to Blocks objects.""" + handler = RequestHandler( + redis_connection=redis_with_chromosomes, pairwise_address="localhost:8081" + ) + + enriched = await handler._addChromosomeLengths(sample_blocks) + + # Check that chromosomeLength was added + for blocks_obj in enriched: + assert "chromosomeLength" in blocks_obj + chr_name = blocks_obj["chromosome"] + if chr_name == "chr1": + assert blocks_obj["chromosomeLength"] == 10000 + elif chr_name == "chr2": + assert blocks_obj["chromosomeLength"] == 8000 + + +@pytest.mark.unit +@pytest.mark.asyncio +class TestProcessWithChromosomeName: + """Test the new processWithChromosomeName endpoint.""" + + async def test_process_with_chromosome_name_success( + self, + fakeredis_connection, + mock_chromosome_service, + mock_genes_service, + mock_pairwise_service, + ): + """Test successful processing with chromosome name.""" + handler = RequestHandler( + redis_connection=fakeredis_connection, + pairwise_address="localhost:8081", + chromosome_address="localhost:8082", + genes_address="localhost:8083", + ) + + # Setup chromosome data in Redis + await fakeredis_connection.hset( + "chromosome:chr1", + mapping={ + "name": "chr1", + "genus": "Test", + "species": "species", + "length": "10000", + }, + ) + + with patch( + "macro_synteny_blocks.request_handler.getChromosome", + new=mock_chromosome_service, + ): + with patch( + "macro_synteny_blocks.request_handler.getGenes", new=mock_genes_service + ): + with patch( + "macro_synteny_blocks.request_handler.computePairwiseMacroSyntenyBlocks", + new=mock_pairwise_service, + ): + with patch.object(handler, 
"_getTargets", return_value=["chr1"]): + result = await handler.processWithChromosomeName( + chromosome_name="test_chr", + matched=3, + intermediate=5, + mask=None, + targets=[], + metrics=[], + chromosome_genes=3, + chromosome_length=1, + ) + + # Should return enriched blocks + assert isinstance(result, list) + + async def test_process_with_chromosome_name_no_address(self, fakeredis_connection): + """Test that ValueError is raised when chromosome_address is None.""" + handler = RequestHandler( + redis_connection=fakeredis_connection, + pairwise_address="localhost:8081", + chromosome_address=None, # Not configured + ) + + with pytest.raises(ValueError, match="Chromosome address is not configured"): + await handler.processWithChromosomeName( + chromosome_name="test_chr", + matched=3, + intermediate=5, + mask=None, + targets=[], + metrics=[], + chromosome_genes=3, + chromosome_length=1, + ) + + async def test_process_with_chromosome_name_not_found( + self, fakeredis_connection, mock_chromosome_service + ): + """Test that ValueError is raised when chromosome is not found.""" + handler = RequestHandler( + redis_connection=fakeredis_connection, + pairwise_address="localhost:8081", + chromosome_address="localhost:8082", + ) + + async def mock_getChromosome_none(name, address): + return None + + with patch( + "macro_synteny_blocks.request_handler.getChromosome", + new=mock_getChromosome_none, + ): + with pytest.raises(ValueError, match="not found"): + await handler.processWithChromosomeName( + chromosome_name="nonexistent", + matched=3, + intermediate=5, + mask=None, + targets=[], + metrics=[], + chromosome_genes=3, + chromosome_length=1, + ) + + +@pytest.mark.unit +class TestGrpcBlockToDictBlock: + """Test conversion from gRPC blocks to dict blocks.""" + + def setup_method(self): + self.handler = RequestHandler( + redis_connection=None, pairwise_address="localhost:8081" + ) + + def test_basic_conversion(self): + """Test basic block conversion.""" + Block = namedtuple( + 
"Block", ["i", "j", "fmin", "fmax", "orientation", "optionalMetrics"] + ) + grpc_block = Block( + i=0, j=5, fmin=0, fmax=5000, orientation="+", optionalMetrics=[] + ) + + dict_block = self.handler._grpcBlockToDictBlock(grpc_block) + + assert dict_block["i"] == 0 + assert dict_block["j"] == 5 + assert dict_block["fmin"] == 0 + assert dict_block["fmax"] == 5000 + assert dict_block["orientation"] == "+" + + def test_conversion_with_metrics(self): + """Test conversion with optional metrics.""" + Block = namedtuple( + "Block", ["i", "j", "fmin", "fmax", "orientation", "optionalMetrics"] + ) + grpc_block = Block( + i=0, j=5, fmin=0, fmax=5000, orientation="+", optionalMetrics=[0.5, 0.8] + ) + + dict_block = self.handler._grpcBlockToDictBlock(grpc_block) + + assert "optionalMetrics" in dict_block + assert dict_block["optionalMetrics"] == [0.5, 0.8] diff --git a/macro_synteny_paf/DATA-SPEC.md b/macro_synteny_paf/DATA-SPEC.md new file mode 100644 index 00000000..7356175b --- /dev/null +++ b/macro_synteny_paf/DATA-SPEC.md @@ -0,0 +1,168 @@ +# Data Format +## PAF +https://github.com/lh3/miniasm/blob/master/PAF.md + +| Column | Name | Type | Description | +|--------|------|------|-------------| +| 1 | Query name | string | Query sequence (chromosome) name | +| 2 | Query length | int | Total length of query sequence | +| 3 | Query start | int | Query start position (0-based) | +| 4 | Query end | int | Query end position (0-based) | +| 5 | Strand | char | Relative strand: "+" or "-" | +| 6 | Target name | string | Target sequence (chromosome) name | +| 7 | Target length | int | Total length of target sequence | +| 8 | Target start | int | Target start position (0-based) | +| 9 | Target end | int | Target end position (0-based) | +| 10 | Matches | int | Number of residue matches | +| 11 | Block length | int | Alignment block length | +| 12 | Mapping quality | int | Mapping quality (0-255, 255=missing) | + +### Example + +``` +aradu.V14167.gnm1.chr01 110876686 1234567 1240000 + 
arahy.Tifrunner.gnm1.Arahy.01 119055080 2345678 2351111 1 1 255 +aradu.V14167.gnm1.chr01 110876686 5678901 5684000 - arahy.Tifrunner.gnm1.Arahy.02 118608362 3456789 3462789 1 1 255 +``` + +## JSON +```json + +{ + "type": "object", + "required": ["alignments"], + "properties": { + "alignments": { + "type": "array", + "items": { + "type": "object", + "required": [ + "query", + "target", + "strand", + "numResidueMatches", + "alignmentBlockLength", + "mappingQuality" + ], + "properties": { + "query": { + "type": "object", + "required": ["name", "length", "start", "end"], + "properties": { + "name": { + "type": "string", + "description": "Query name" + }, + "length": { + "type": "integer", + "minimum": 1, + "description": "Total length of query in base pairs" + }, + "start": { + "type": "integer", + "minimum": 0, + "description": "Alignment start position on query (0-based)" + }, + "end": { + "type": "integer", + "minimum": 0, + "description": "Alignment end position on query (0-based)" + } + } + }, + "target": { + "type": "object", + "required": ["name", "length", "start", "end"], + "properties": { + "name": { + "type": "string", + "description": "Target name" + }, + "length": { + "type": "integer", + "minimum": 1, + "description": "Total length of target in base pairs" + }, + "start": { + "type": "integer", + "minimum": 0, + "description": "Alignment start position on target (0-based)" + }, + "end": { + "type": "integer", + "minimum": 0, + "description": "Alignment end position on target (0-based)" + } + } + }, + "strand": { + "type": "string", + "enum": ["+", "-"], + "description": "Relative strand orientation" + }, + "numResidueMatches": { + "type": "integer", + "minimum": 0, + "description": "Number of matching residues in alignment" + }, + "alignmentBlockLength": { + "type": "integer", + "minimum": 0, + "description": "Length of alignment block" + }, + "mappingQuality": { + "type": "integer", + "minimum": 0, + "maximum": 255, + "description": "Mapping quality 
score (255 indicates missing/unavailable)" + } + } + } + } + } +} +``` + +### Example + +```json +{ + "alignments": [ + { + "query": { + "name": "aradu.V14167.gnm1.chr01", + "length": 110876686, + "start": 1234567, + "end": 1240000 + }, + "target": { + "name": "arahy.Tifrunner.gnm1.Arahy.01", + "length": 119055080, + "start": 2345678, + "end": 2351111 + }, + "strand": "+", + "numResidueMatches": 1, + "alignmentBlockLength": 1, + "mappingQuality": 255 + }, + { + "query": { + "name": "aradu.V14167.gnm1.chr01", + "length": 110876686, + "start": 5678901, + "end": 5684000 + }, + "target": { + "name": "arahy.Tifrunner.gnm1.Arahy.02", + "length": 118608362, + "start": 3456789, + "end": 3462789 + }, + "strand": "-", + "numResidueMatches": 1, + "alignmentBlockLength": 1, + "mappingQuality": 255 + } + ] +} +``` diff --git a/macro_synteny_paf/Dockerfile b/macro_synteny_paf/Dockerfile new file mode 100644 index 00000000..9da980a4 --- /dev/null +++ b/macro_synteny_paf/Dockerfile @@ -0,0 +1,28 @@ +FROM python:3.13.7-slim-trixie + +# install gcc and other build requirements +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY INSTALL ./ +COPY LICENSE ./ +COPY MANIFEST.in ./ +COPY README.md ./ +COPY setup.cfg ./ +COPY setup.py ./ +COPY requirements.txt ./ +COPY macro_synteny_paf/ ./macro_synteny_paf/ +COPY proto/ ./proto/ + +# install the package dependencies +RUN pip3 install --no-cache-dir -r requirements.txt + +# install (and implicitly build) the package +RUN pip3 install --no-cache-dir . 
+ +WORKDIR / + +ENTRYPOINT ["python3", "-u", "-m", "macro_synteny_paf"] diff --git a/macro_synteny_paf/Dockerfile.test b/macro_synteny_paf/Dockerfile.test new file mode 100644 index 00000000..7466302b --- /dev/null +++ b/macro_synteny_paf/Dockerfile.test @@ -0,0 +1,35 @@ +FROM python:3.13.7-slim-trixie + +# install gcc and other build requirements +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY INSTALL ./ +COPY LICENSE ./ +COPY MANIFEST.in ./ +COPY README.md ./ +COPY setup.cfg ./ +COPY setup.py ./ +COPY requirements.txt ./ +COPY requirements-test.txt ./ +COPY macro_synteny_paf/ ./macro_synteny_paf/ +COPY proto/ ./proto/ +COPY tests/ ./tests/ +COPY pytest.ini ./ + +# install the package dependencies +RUN pip3 install --no-cache-dir -r requirements.txt + +# install test dependencies +RUN pip3 install --no-cache-dir -r requirements-test.txt + +# install (and implicitly build) the package +RUN pip3 install --no-cache-dir . + +# CRITICAL: Remove source directory to ensure tests use installed package +RUN rm -rf macro_synteny_paf/ + +CMD ["pytest", "-v", "--tb=short"] diff --git a/macro_synteny_paf/INSTALL b/macro_synteny_paf/INSTALL new file mode 100644 index 00000000..fbaeeb91 --- /dev/null +++ b/macro_synteny_paf/INSTALL @@ -0,0 +1,6 @@ + +Please use + python setup.py install + +and report errors to Alan Cleary (acleary@ncgr.org) + diff --git a/macro_synteny_paf/LICENSE b/macro_synteny_paf/LICENSE new file mode 100644 index 00000000..d6456956 --- /dev/null +++ b/macro_synteny_paf/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/macro_synteny_paf/MANIFEST.in b/macro_synteny_paf/MANIFEST.in new file mode 100644 index 00000000..ddff3452 --- /dev/null +++ b/macro_synteny_paf/MANIFEST.in @@ -0,0 +1,5 @@ +include INSTALL +include LICENSE +include MANIFEST.in +include *.md +recursive-include proto/ *.proto diff --git a/macro_synteny_paf/README.md b/macro_synteny_paf/README.md new file mode 100644 index 00000000..28a3fe2c --- /dev/null +++ b/macro_synteny_paf/README.md @@ -0,0 +1,56 @@ +# Macro-Synteny PAF Microservice + +This directory contains the macro-synteny-paf microservice. +This microservice takes two genome prefixes, and returns a set of synteny blocks in [PAF format](https://github.com/lh3/miniasm/blob/master/PAF.md). 
+The minimum number of matching annotations in a block and the maximum number of intermediate genes between any two matches in a block must also be provided. + +## Setup + +We assume you have already set up Redis with RediSearch and populated it with data from a PostgreSQL database configured with the Chado schema. +See the `../../database/README.md` file for instructions on how to do this. + +The easiest way to run the microservice is with a [Python Virtual Environment](http://docs.python-guide.org/en/latest/dev/virtualenvs/). +Once virtualenv is installed, you can create a virtual environment as follows + + $ virtualenv venv + +All the microservice's dependencies are listed in the `requirements.txt` file, which can be used to bootstrap the virtual environment as follows + + $ . ./venv/bin/activate + (venv) $ pip install -r requirements.txt + +## Running + +The microservice loads data from a RediSearch index and hosts an HTTP and a gRPC server. +The credentials for the microservice can be set via command line flags or via environment variables. +The RediSearch database credentials can be provided via the `REDIS_DB`, `REDIS_PASSWORD`, `REDIS_HOST`, and `REDIS_PORT` environment variables. +The HTTP server credentials can be provided via the `HTTP_HOST` and `HTTP_PORT` environment variables. +And the gRPC server credentials can be provided via the `GRPC_HOST` and `GRPC_PORT` environment variables. +Run the microservice as follows + + (venv) $ ./microservice.py + +For more information about the microservice, run + + (venv) $ ./microservice.py --help + +## Use + +The microservice can be queried via HTTP GET or [TODO] gRPC. + +The default request URL is `localhost:8080/macro-synteny-paf`. 
+ +The following is an example HTTP GET URL: + + localhost:8080/macro-synteny-paf?genome1=aradu.V14167.gnm1&genome2=arahy.Tifrunner.gnm1&matched=10&intermediate=5&mask=20 + +where + + genome1: query genome prefix + genome2: target genome prefix + matched: minimum number of matching annotations in a block + intermediate: maximum number of intermediate genes between any two matches in a block + mask: (optional) + +See the `macrosyntenypaf.proto` file and its auto-generated stubs for gRPC requests. diff --git a/macro_synteny_paf/compose.test.yml b/macro_synteny_paf/compose.test.yml new file mode 100644 index 00000000..26ddc4f6 --- /dev/null +++ b/macro_synteny_paf/compose.test.yml @@ -0,0 +1,43 @@ +# This file overrides services from +# github.com/legumeinfo/gcv-docker-compose compose.yml and compose.dev.yml +# +# Please use the convenience script: +# ./run-tests.sh + +services: + # Override redis to load test data + redis: + volumes: + # Load test data from dump.rdb + # TEST_DATA_DIR is set by run-tests.sh + - ${TEST_DATA_DIR}/tests/data/dump.rdb:/data/dump.rdb:ro + environment: + REDIS_ARGS: "" # Enable loading from dump.rdb + REDISEARCH_ARGS: "MAXSEARCHRESULTS 100000" + + # Add the test container + macro_synteny_paf_test: + build: + # MACRO_SYNTENY_PAF_DIR is set by run-tests.sh + context: ${MACRO_SYNTENY_PAF_DIR:-.} + dockerfile: Dockerfile.test + depends_on: + redis: + condition: service_healthy + chromosome: + condition: service_started + genes: + condition: service_started + macro_synteny_blocks: + condition: service_started + environment: + # Service connection configuration + REDIS_HOST: redis + REDIS_PORT: "6379" + CHROMOSOME_ADDR: chromosome:81 + GENES_ADDR: genes:81 + MACRO_SYNTENY_BLOCKS_ADDR: macro_synteny_blocks:81 + volumes: + # Mount test results for coverage reports + - ${MACRO_SYNTENY_PAF_DIR:-.}/htmlcov:/app/htmlcov:rw + profiles: [] # Always run, no profile required diff --git a/macro_synteny_paf/macro_synteny_paf/__init__.py 
b/macro_synteny_paf/macro_synteny_paf/__init__.py new file mode 100644 index 00000000..3934e508 --- /dev/null +++ b/macro_synteny_paf/macro_synteny_paf/__init__.py @@ -0,0 +1,12 @@ +def int_or_str(value): + try: + return int(value) + except ValueError: + return value + + +__version__ = "1.3.3" +VERSION = tuple(map(int_or_str, __version__.split("."))) + +__schema_version__ = "1.1.0" +SCHEMA_VERSION = tuple(map(int_or_str, __schema_version__.split("."))) diff --git a/macro_synteny_paf/macro_synteny_paf/__main__.py b/macro_synteny_paf/macro_synteny_paf/__main__.py new file mode 100755 index 00000000..dafeb18d --- /dev/null +++ b/macro_synteny_paf/macro_synteny_paf/__main__.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python + +# Python +import argparse +import asyncio +import logging +import os +import signal + +# dependencies +import uvloop + +# module +import macro_synteny_paf +from macro_synteny_paf.database import connectToRedis +from macro_synteny_paf.grpc_server import run_grpc_server +from macro_synteny_paf.http_server import run_http_server +from macro_synteny_paf.request_handler import RequestHandler + +LOG_LEVELS = { + "DEBUG": logging.DEBUG, + "INFO": logging.INFO, + "WARNING": logging.WARNING, + "ERROR": logging.ERROR, + "CRITICAL": logging.CRITICAL, +} + + +# a class that loads argument values from command line variables, resulting in a +# value priority: command line > environment variable > default value +class EnvArg(argparse.Action): + def __init__(self, envvar, required=False, default=None, **kwargs): + if envvar in os.environ: + default = os.environ[envvar] + if required and default is not None: + required = False + super(EnvArg, self).__init__(default=default, required=required, **kwargs) + + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, values) + + +def parseArgs(): + # create the parser + parser = argparse.ArgumentParser( + prog=macro_synteny_paf.__name__, + description=""" + A microservice for finding 
macro-synteny blocks between two genomes and returning them in PAF format. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--version", + action="version", + version=f""" + %(prog)s {macro_synteny_paf.__version__} schema + {macro_synteny_paf.__schema_version__} + """, + ) + + # logging args + loglevel_envvar = "LOG_LEVEL" + parser.add_argument( + "--log-level", + dest="log_level", + action=EnvArg, + envvar=loglevel_envvar, + type=str, + choices=list(LOG_LEVELS.keys()), + default="WARNING", + help=f""" + What level of events should be logged (can also be specified using the + {loglevel_envvar} environment variable). + """, + ) + logfile_envvar = "LOG_FILE" + parser.add_argument( + "--log-file", + dest="log_file", + action=EnvArg, + default=argparse.SUPPRESS, # removes "(default: None)" from help text + envvar=logfile_envvar, + type=str, + help=f""" + The file events should be logged in (can also be specified using the + {logfile_envvar} environment variable). + """, + ) + + # Async HTTP args + parser.add_argument( + "--no-http", + dest="nohttp", + action="store_true", + help="Don't run the HTTP server.", + ) + parser.set_defaults(nohttp=False) + hhost_envvar = "HTTP_HOST" + parser.add_argument( + "--hhost", + action=EnvArg, + envvar=hhost_envvar, + type=str, + default="localhost", + help=f""" + The HTTP server host (can also be specified using the {hhost_envvar} environment + variable). + """, + ) + hport_envvar = "HTTP_PORT" + parser.add_argument( + "--hport", + action=EnvArg, + envvar=hport_envvar, + type=str, + default="8080", + help=f""" + The HTTP server port (can also be specified using the {hport_envvar} environment + variable). 
+ """, + ) + + # gRPC args + parser.add_argument( + "--no-grpc", + dest="nogrpc", + action="store_true", + help="Don't run the gRPC server.", + ) + parser.set_defaults(nogrpc=False) + ghost_envvar = "GRPC_HOST" + parser.add_argument( + "--ghost", + action=EnvArg, + envvar=ghost_envvar, + type=str, + default="[::]", + help=f""" + The gRPC server host (can also be specified using the {ghost_envvar} environment + variable). + """, + ) + gport_envvar = "GRPC_PORT" + parser.add_argument( + "--gport", + action=EnvArg, + envvar=gport_envvar, + type=str, + default="8081", + help=f""" + The gRPC server port (can also be specified using the {gport_envvar} environment + variable). + """, + ) + + # Redis args + rdb_envvar = "REDIS_DB" + parser.add_argument( + "--rdb", + action=EnvArg, + envvar=rdb_envvar, + type=int, + default=0, + help=f""" + The Redis database (can also be specified using the {rdb_envvar} environment + variable). + """, + ) + rpassword_envvar = "REDIS_PASSWORD" + parser.add_argument( + "--rpassword", + action=EnvArg, + envvar=rpassword_envvar, + type=str, + help=f""" + The Redis password (can also be specified using the {rpassword_envvar} + environment variable). + """, + ) + rhost_envvar = "REDIS_HOST" + parser.add_argument( + "--rhost", + action=EnvArg, + envvar=rhost_envvar, + type=str, + default="localhost", + help=f""" + The Redis host (can also be specified using the {rhost_envvar} environment + variable). + """, + ) + rport_envvar = "REDIS_PORT" + parser.add_argument( + "--rport", + action=EnvArg, + envvar=rport_envvar, + type=int, + default=6379, + help=f""" + The Redis port (can also be specified using the {rport_envvar} environment + variable). 
+ """, + ) + + # Inter-microservice communication args + chromosomeaddr_envvar = "CHROMOSOME_ADDR" + parser.add_argument( + "--chromosomeaddr", + action=EnvArg, + envvar=chromosomeaddr_envvar, + type=str, + required=True, + help=f""" + The address of the chromosome microservice (can also be specified + using the {chromosomeaddr_envvar} environment variable). + """, + ) + genesaddr_envvar = "GENES_ADDR" + parser.add_argument( + "--genesaddr", + action=EnvArg, + envvar=genesaddr_envvar, + type=str, + required=True, + help=f""" + The address of the genes microservice (can also be specified + using the {genesaddr_envvar} environment variable). + """, + ) + macrosyntenyblocksaddr_envvar = "MACRO_SYNTENY_BLOCKS_ADDR" + parser.add_argument( + "--macrosyntenyblocksaddr", + action=EnvArg, + envvar=macrosyntenyblocksaddr_envvar, + type=str, + required=True, + help=f""" + The address of the macro-synteny-blocks microservice (can also be specified + using the {macrosyntenyblocksaddr_envvar} environment variable). + """, + ) + + # Caching args + cache_enabled_envvar = "PAF_CACHE_ENABLED" + parser.add_argument( + "--cache-enabled", + dest="cache_enabled", + action=EnvArg, + envvar=cache_enabled_envvar, + type=lambda x: x.lower() in ("true", "1", "yes"), + default=True, + help=f""" + Enable result caching in Redis (can also be specified using the + {cache_enabled_envvar} environment variable). Default: true + """, + ) + cache_ttl_envvar = "PAF_CACHE_TTL" + parser.add_argument( + "--cache-ttl", + dest="cache_ttl", + action=EnvArg, + envvar=cache_ttl_envvar, + type=int, + default=86400, + help=f""" + Cache TTL in seconds (can also be specified using the {cache_ttl_envvar} + environment variable). 
Default: 86400 (24 hours) + """, + ) + + return parser.parse_args() + + +# graceful shutdown +async def shutdown(loop, signal=None): + # report what signal (if any) initiated the shutdown + if signal: + logging.info(f"Received exit signal {signal.name}") + # cancel all running tasks (they know how to cleanup themselves) + logging.info("Cancelling outstanding tasks") + tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()] + [task.cancel() for task in tasks] + await asyncio.gather(*tasks, return_exceptions=True) + # stop the asyncio loop + loop.stop() + + +# the asyncio exception handler that will initiate a shutdown +def handleException(loop, context): + msg = context.get("exception", context["message"]) + logging.critical(f"Caught exception: {msg}") + logging.info("Shutting down") + asyncio.create_task(shutdown(loop)) + + +def main(): + # parse the command line arguments / environment variables + args = parseArgs() + if args.nohttp and args.nogrpc: + exit("--no-http and --no-grpc can't both be given") + + # setup logging + log_config = { + "format": "%(asctime)s,%(msecs)d %(levelname)s: %(message)s", + "datefmt": "%H:%M:%S", + "level": LOG_LEVELS[args.log_level], + } + if "log_file" in args: + log_config["filename"] = args.log_file + logging.basicConfig(**log_config) + + # initialize asyncio + loop = uvloop.new_event_loop() + asyncio.set_event_loop(loop) + + # setup asyncio exception handling + signals = (signal.SIGHUP, signal.SIGTERM, signal.SIGINT) + for s in signals: + loop.add_signal_handler( + s, lambda s=s: loop.create_task(shutdown(loop, signal=s)) + ) + loop.set_exception_handler(handleException) + + # run the program + try: + # create the database connection + redis_connection = loop.run_until_complete( + connectToRedis(args.rhost, args.rport, args.rdb, args.rpassword) + ) + # create the request handler + handler = RequestHandler( + redis_connection, + args.chromosomeaddr, + args.genesaddr, + args.macrosyntenyblocksaddr, + 
cache_enabled=args.cache_enabled,
+            cache_ttl=args.cache_ttl,
+        )
+        # start the HTTP server
+        if not args.nohttp:
+            loop.create_task(run_http_server(args.hhost, args.hport, handler))
+        # start the gRPC server
+        if not args.nogrpc:
+            loop.create_task(run_grpc_server(args.ghost, args.gport, handler))
+        # run the main loop
+        loop.run_forever()
+    # catch exceptions not handled by asyncio
+    except Exception as e:
+        context = {"exception": e, "message": str(e)}
+        loop.call_exception_handler(context)
+    # finalize the shutdown
+    finally:
+        loop.close()
+        logging.info("Successfully shutdown.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/macro_synteny_paf/macro_synteny_paf/aioredisearch.py b/macro_synteny_paf/macro_synteny_paf/aioredisearch.py
new file mode 100644
index 00000000..fa7adecc
--- /dev/null
+++ b/macro_synteny_paf/macro_synteny_paf/aioredisearch.py
@@ -0,0 +1,36 @@
+# Python
+import time
+
+# dependencies
+from redis.asyncio.client import Pipeline as AsyncPipeline
+from redis.client import Pipeline
+from redis.commands.search import AsyncSearch
+from redis.commands.search.commands import SEARCH_CMD
+from redis.commands.search.result import Result
+
+
+# a class that overrides a subset of the RediSearch Client methods to be
+# asynchronous
class CustomAsyncSearch(AsyncSearch):
+    # a copy of the RediSearch search command's inline Result instantiation so we
+    # can process results from searches made with a Redis Pipeline
+    def search_result(self, query, res, st=None):
+        return Result(
+            res,
+            not query._no_content,
+            duration=(time.time() - st) * 1000.0 if st is not None else 0.0,
+            has_payload=query._with_payloads,
+            with_scores=query._with_scores,
+        )
+
+    # a copy of the RediSearch search command that checks for the async pipeline;
+    # I've opened an issue in the redis-py repo since this is a bug:
+    # https://github.com/redis/redis-py/issues/2279
+    async def search(self, query, query_params=None):
+        args, query = self._mk_query_args(query, query_params=query_params)
+        st = time.time()
+        res = await self.execute_command(SEARCH_CMD, *args)
+
+        if isinstance(res, Pipeline) or isinstance(res, AsyncPipeline):
+            return res
+        return self.search_result(query, res, st)
diff --git a/macro_synteny_paf/macro_synteny_paf/commands.py b/macro_synteny_paf/macro_synteny_paf/commands.py
new file mode 100644
index 00000000..b04098e6
--- /dev/null
+++ b/macro_synteny_paf/macro_synteny_paf/commands.py
@@ -0,0 +1,80 @@
+# this file is a modification of:
+# https://github.com/grpc/grpc/blob/2231c2ba77cf22f3c8c302d91209c1c3f2f0632f/tools/distrib/python/grpcio_tools/grpc_tools/command.py
+
+# here is an example of how it should be used:
+# https://github.com/grpc/grpc/blob/fd3bd70939fb4239639fbd26143ec416366e4157/src/python/grpcio_health_checking/health_commands.py
+
+# Copyright 2015 gRPC authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Provides distutils command classes for the macro-synteny-paf Python setup process.
+""" + + +import os +import sys +from importlib.resources import files as resource_files + +import setuptools + + +class BuildProtos(setuptools.Command): + """Command to generate project *_pb2.py modules from proto files.""" + + description = "build grpc protobuf modules" + user_options = [ + ("strict-mode", "s", "exit with non-zero value if the proto compiling fails.") + ] + + def initialize_options(self): + self.strict_mode = False + self.build_lib = None + self.proto_dir = None + self.proto_build_dir = None + + def finalize_options(self): + self.set_undefined_options("build", ("build_lib", "build_lib")) + package_root = self.distribution.package_dir[""] + self.proto_dir = os.path.abspath(os.path.join(package_root, "proto")) + self.proto_build_dir = os.path.abspath( + os.path.join(self.build_lib, "macro_synteny_paf/proto") + ) + + def run(self): + self.build_package_protos() + + def build_package_protos(self): + from grpc_tools import protoc + + proto_files = [] + for root, _, files in os.walk(self.proto_dir): + for filename in files: + if filename.endswith(".proto"): + proto_files.append(os.path.abspath(os.path.join(root, filename))) + + well_known_protos_include = str(resource_files("grpc_tools").joinpath("_proto")) + + for proto_file in proto_files: + command = [ + "grpc_tools.protoc", + "--proto_path={}".format(self.proto_dir), + "--proto_path={}".format(well_known_protos_include), + "--python_out={}".format(self.proto_build_dir), + "--grpc_python_out={}".format(self.proto_build_dir), + ] + [proto_file] + if protoc.main(command) != 0: + if self.strict_mode: + raise Exception("error: {} failed".format(command)) + else: + sys.stderr.write("warning: {} failed".format(command)) diff --git a/macro_synteny_paf/macro_synteny_paf/database.py b/macro_synteny_paf/macro_synteny_paf/database.py new file mode 100644 index 00000000..ee3c4a44 --- /dev/null +++ b/macro_synteny_paf/macro_synteny_paf/database.py @@ -0,0 +1,35 @@ +# dependencies +import redis.asyncio as redis + 
+# module +import macro_synteny_paf + +COMPATIBLE_KEY = "GCV_COMPATIBLE_SCHEMA_VERSIONS" + + +class SchemaVersionError(Exception): + """ + The exception to raise when the GCV database schema doesn't support the schema + required by the service. + """ + + pass + + +async def connectToRedis(host="localhost", port=6379, db=0, password=None): + # connect to database + connection = await redis.Redis( + host=host, port=port, db=db, password=password, decode_responses=True + ) + # ping to force connection, preventing errors downstream + await connection.ping() + # check that the database is loaded with a compatible schema version + if not await connection.sismember( + COMPATIBLE_KEY, macro_synteny_paf.__schema_version__ + ): + message = ( + "The Redis database does not support the required GCV schema " + f"version: {macro_synteny_paf.__schema_version__}" + ) + raise SchemaVersionError(message) + return connection diff --git a/macro_synteny_paf/macro_synteny_paf/grpc_client.py b/macro_synteny_paf/macro_synteny_paf/grpc_client.py new file mode 100644 index 00000000..54afe030 --- /dev/null +++ b/macro_synteny_paf/macro_synteny_paf/grpc_client.py @@ -0,0 +1,176 @@ +# Python +import logging + +# isort: split + +# dependencies +from grpc.experimental import aio + +# isort: split + +# module +# isort: off +# from macro_synteny_paf.proto.genes_service.v1 +# import genes_pb2 +# from macro_synteny_paf.proto.genes_service.v1 +# import genes_pb2_grpc +# from macro_synteny_paf.proto.chromosome_service.v1 +# import chromosome_pb2 +# from macro_synteny_paf.proto.chromosome_service.v1 +# import chromosome_pb2_grpc +# from macro_synteny_paf.proto.macrosyntenyblocks_service.v1 +# import macrosyntenyblocks_pb2 +# from macro_synteny_paf.proto.macrosyntenyblocks_service.v1 +# import macrosyntenyblocks_pb2_grpc +# NOTE: the following imports are a temporary workaround for a known protobuf +# bug; the commented imports above should be used when the bug is fixed: +# 
https://github.com/protocolbuffers/protobuf/issues/10075 +from macro_synteny_paf import proto # noqa: F401 +from genes_service.v1 import genes_pb2, genes_pb2_grpc +from chromosome_service.v1 import chromosome_pb2, chromosome_pb2_grpc +from macrosyntenyblocks_service.v1 import ( + macrosyntenyblocks_pb2, + macrosyntenyblocks_pb2_grpc, +) + +# isort: on + + +async def getGenes( + gene_names, + address, +): + # fetch channel every time to support dynamic services + channel = aio.insecure_channel(address) + await channel.channel_ready() + stub = genes_pb2_grpc.GenesStub(channel) + try: + result = await stub.Get( + genes_pb2.GenesGetRequest( + names=gene_names, + ) + ) + return result.genes + except Exception as e: + logging.error(e) + return None + + +async def getChromosome( + chromosome_name, + address, +): + # fetch channel every time to support dynamic services + channel = aio.insecure_channel(address) + await channel.channel_ready() + stub = chromosome_pb2_grpc.ChromosomeStub(channel) + try: + result = await stub.Get( + chromosome_pb2.ChromosomeGetRequest( + name=chromosome_name, + ) + ) + return result.chromosome + except Exception as e: + logging.error(e) + return None + + +async def getChromosomeLength( + chromosome_name, + address, +): + # fetch channel every time to support dynamic services + channel = aio.insecure_channel(address) + await channel.channel_ready() + stub = chromosome_pb2_grpc.ChromosomeStub(channel) + try: + result = await stub.Get( + chromosome_pb2.ChromosomeGetRequest( + name=chromosome_name, + ) + ) + return result.chromosome.length + except Exception as e: + logging.error(e) + return None + + +async def computeMacroSyntenyBlocks( + chromosome, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + address, + identity=None, + correspondences=None, +): + # fetch channel every time to support dynamic services + channel = aio.insecure_channel(address) + await channel.channel_ready() + stub = 
macrosyntenyblocks_pb2_grpc.MacroSyntenyBlocksStub(channel) + try: + result = await stub.Compute( + macrosyntenyblocks_pb2.MacroSyntenyBlocksComputeRequest( + chromosome=chromosome, + matched=matched, + intermediate=intermediate, + mask=mask, + targets=targets, + optionalMetrics=metrics, + chromosomeGenes=chromosome_genes, + chromosomeLength=chromosome_length, + identity=identity, + correspondences=correspondences, + ) + ) + return result.blocks + except Exception as e: + logging.error(e) + return None + + +async def computeMacroSyntenyBlocksByChromosome( + chromosome_name, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + address, + identity=None, + correspondences=None, +): + """ + Compute macro synteny blocks using chromosome name instead of gene families. + Uses the new ComputeByChromosome endpoint in macro-synteny-blocks service. + """ + # fetch channel every time to support dynamic services + channel = aio.insecure_channel(address) + await channel.channel_ready() + stub = macrosyntenyblocks_pb2_grpc.MacroSyntenyBlocksStub(channel) + try: + result = await stub.ComputeByChromosome( + macrosyntenyblocks_pb2.MacroSyntenyBlocksComputeByChromosomeRequest( + chromosomeName=chromosome_name, + matched=matched, + intermediate=intermediate, + mask=mask, + targets=targets, + optionalMetrics=metrics, + chromosomeGenes=chromosome_genes, + chromosomeLength=chromosome_length, + identity=identity, + correspondences=correspondences, + ) + ) + return result.blocks + except Exception as e: + logging.error(e) + return None diff --git a/macro_synteny_paf/macro_synteny_paf/grpc_server.py b/macro_synteny_paf/macro_synteny_paf/grpc_server.py new file mode 100644 index 00000000..6285f2ff --- /dev/null +++ b/macro_synteny_paf/macro_synteny_paf/grpc_server.py @@ -0,0 +1,110 @@ +# dependencies +import grpc +from grpc.experimental import aio + +# isort: split + +# module +# isort: off +# from 
macro_synteny_paf.proto.macrosyntenypaf_service.v1 +# import macrosyntenypaf_pb2 +# from macro_synteny_paf.proto.macrosyntenypaf_service.v1 +# import macrosyntenypaf_pb2_grpc +# NOTE: the following imports are a temporary workaround for a known protobuf +# bug; the commented imports above should be used when the bug is fixed: +# https://github.com/protocolbuffers/protobuf/issues/10075 +from macro_synteny_paf import proto # noqa: F401 +from macrosyntenypaf_service.v1 import ( + macrosyntenypaf_pb2, + macrosyntenypaf_pb2_grpc, +) + +# isort: on + + +class MacroSyntenyPaf(macrosyntenypaf_pb2_grpc.MacroSyntenyPafServicer): + def __init__(self, handler): + self.handler = handler + + # create a context done callback that raises the given exception + def _exceptionCallbackFactory(self, exception): + def exceptionCallback(call): + raise exception + + return exceptionCallback + + # the method that actually handles requests + async def _compute(self, request, context): + # required parameters + chromosome = request.chromosome + matched = request.matched + intermediate = request.intermediate + # optional parameters + mask = request.mask or None + targets = request.targets or None + metrics = request.optionalMetrics or None + chromosome_genes = request.chromosomeGenes or None + chromosome_length = request.chromosomeLength or None + try: + ( + chromosome, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + ) = self.handler.parseArguments( + chromosome, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + ) + except Exception: + # raise a gRPC INVALID ARGUMENT error + await context.abort( + grpc.StatusCode.INVALID_ARGUMENT, + "Required arguments are missing or given arguments have invalid values", + ) + blocks = await self.handler.process( + chromosome, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + ) + return 
macrosyntenypaf_pb2.MacroSyntenyPafComputeReply(blocks=blocks) + + # implements the service's API + async def Compute(self, request, context): + # subvert the gRPC exception handler via a try/except block + try: + return await self._compute(request, context) + # let errors we raised go by + except aio.AbortError as e: + raise e + # raise an internal error to prevent non-gRPC info from being sent to users + except Exception as e: + # raise the exception after aborting so it gets logged + # NOTE: gRPC docs says abort should raise an error but it doesn't... + context.add_done_callback(self._exceptionCallbackFactory(e)) + # return a gRPC INTERNAL error + await context.abort(grpc.StatusCode.INTERNAL, "Internal server error") + + +async def run_grpc_server(host, port, handler): + server = aio.server() + server.add_insecure_port(f"{host}:{port}") + servicer = MacroSyntenyPaf(handler) + macrosyntenypaf_pb2_grpc.add_MacroSyntenyPafServicer_to_server(servicer, server) + await server.start() + await server.wait_for_termination() + # TODO: what about teardown? 
server.stop(None) diff --git a/macro_synteny_paf/macro_synteny_paf/http_server.py b/macro_synteny_paf/macro_synteny_paf/http_server.py new file mode 100644 index 00000000..74fb6880 --- /dev/null +++ b/macro_synteny_paf/macro_synteny_paf/http_server.py @@ -0,0 +1,104 @@ +# dependencies +import json + +import aiohttp_cors +from aiohttp import web + + +async def http_get_handler(request): + # parse the query from the URL query string parameters + genome_1 = request.rel_url.query.get("genome1", "") + genome_2 = request.rel_url.query.get("genome2", "") + matched = request.rel_url.query.get("matched", "") + intermediate = request.rel_url.query.get("intermediate", "") + # optional parameters + mask = request.rel_url.query.get("mask", None) + format_type = request.rel_url.query.get("format", "json") # default to json + metrics_param = request.rel_url.query.get("metrics", None) + metrics = metrics_param.split(",") if metrics_param else None + identity = request.rel_url.query.get("identity", None) + anchors = request.rel_url.query.get("anchors", None) + chromosome_genes = None # data.get("chromosome_genes", None) + chromosome_length = None # data.get("chromosome_length", None) + handler = request.app["handler"] + + if format_type not in ["json", "paf"]: + return web.HTTPBadRequest( + text="Invalid format parameter. Must be 'json' or 'paf'." 
+ ) + + try: + ( + genome_1, + genome_2, + matched, + intermediate, + mask, + metrics, + chromosome_genes, + chromosome_length, + identity, + anchors, + ) = handler.parseArguments( + genome_1, + genome_2, + matched, + intermediate, + mask, + metrics, + chromosome_genes, + chromosome_length, + identity, + anchors, + ) + except Exception: + return web.HTTPBadRequest( + text="Required arguments are missing or have invalid values" + ) + + result = await handler.process( + genome_1, + genome_2, + matched, + intermediate, + mask, + metrics, + chromosome_genes, + chromosome_length, + identity, + anchors, + grpc_decode=True, + output_format=format_type, + ) + + if format_type == "json": + return web.Response( + text=json.dumps(result, indent=2), content_type="application/json" + ) + else: + return web.Response(text=result, content_type="text/plain") + + +async def run_http_server(host, port, handler): + # make the app + app = web.Application() + app["handler"] = handler + # define the route and enable CORS + cors = aiohttp_cors.setup( + app, + defaults={ + "*": aiohttp_cors.ResourceOptions( + allow_credentials=True, + expose_headers="*", + allow_headers="*", + ) + }, + ) + route = app.router.add_get("/", http_get_handler) + cors.add(route) + # run the app + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, host, port) + await site.start() + # TODO: what about teardown? runner.cleanup() diff --git a/macro_synteny_paf/macro_synteny_paf/proto/__init__.py b/macro_synteny_paf/macro_synteny_paf/proto/__init__.py new file mode 100644 index 00000000..77d8d184 --- /dev/null +++ b/macro_synteny_paf/macro_synteny_paf/proto/__init__.py @@ -0,0 +1,16 @@ +import os +import sys + +# protoc can't generate Python files with relative imports and Python 3 doesn't +# have implicit relative imports like Python 2. This is unfortunate because we +# don't want to add service-specific absolute import paths to .proto files +# because that'll break portability, i.e. 
whoever uses the proto files has to +# have the exact same file structure or they need to change the imports in the +# proto files to fit their use case. We could make the imports relative by +# modifying them in the generated files but this is finicky because it adds an +# extra build step that, if skipped/fails, could leave the developer perplexed. +# So instead here we hack this proto module into the System PATH so the +# generated files' absolute(ly wrong) imports work correctly... +# Google doesn't seem interested in adding support for relative imports: +# https://github.com/protocolbuffers/protobuf/issues/1491 +sys.path.append(os.path.dirname(__file__)) diff --git a/macro_synteny_paf/macro_synteny_paf/request_handler.py b/macro_synteny_paf/macro_synteny_paf/request_handler.py new file mode 100644 index 00000000..b193c8db --- /dev/null +++ b/macro_synteny_paf/macro_synteny_paf/request_handler.py @@ -0,0 +1,547 @@ +# Python +import asyncio +import hashlib +import json +import logging + +# dependencies +from redis.commands.search import AsyncSearch +from redis.commands.search.query import Query + +# module +from macro_synteny_paf.grpc_client import ( + computeMacroSyntenyBlocks, + computeMacroSyntenyBlocksByChromosome, + getChromosome, + getChromosomeLength, + getGenes, +) + + +class RequestHandler: + def __init__( + self, + redis_connection, + chromosome_address, + genes_address, + macrosyntenyblocks_address, + breakpoint_characters=",.<>{}[]\"':;!@#$%^&*()-+=~", + cache_enabled=True, + cache_ttl=86400, + ): + self.redis_connection = redis_connection + self.chromosome_address = chromosome_address + self.genes_address = genes_address + self.macrosyntenyblocks_address = macrosyntenyblocks_address + self.breakpoint_characters = set(breakpoint_characters) + self.cache_enabled = cache_enabled + self.cache_ttl = cache_ttl + + def parseArguments( + self, + genome_1, + genome_2, + matched, + intermediate, + mask, + metrics, + chromosome_genes, + chromosome_length, + 
identity=None, + anchors=None, + ): + if metrics is None: + metrics = [] + iter(metrics) # TypeError if not iterable + matched = int(matched) # ValueError + intermediate = int(intermediate) # ValueError + if chromosome_genes is None: + chromosome_genes = matched + else: + chromosome_genes = int(chromosome_genes) # ValueError + if chromosome_length is None: + chromosome_length = 1 + else: + chromosome_length = int(chromosome_length) # ValueError + if ( + matched <= 0 + or intermediate <= 0 + or chromosome_genes <= 0 + or chromosome_length <= 0 + ): + raise ValueError( + """ + matched, intermediate, chromosome genes, and chromosome length must be + positive + """ + ) + if mask is not None: + mask = int(mask) + if mask <= 0: + raise ValueError("mask must be positive") + # validate identity parameter + if identity is not None and identity not in ("levenshtein", "jaccard"): + raise ValueError('identity must be "levenshtein" or "jaccard"') + # validate anchors parameter + if anchors is not None and anchors not in ("simple", "regular"): + raise ValueError('anchors must be "simple" or "regular"') + return ( + genome_1, + genome_2, + matched, + intermediate, + mask, + metrics, + chromosome_genes, + chromosome_length, + identity, + anchors, + ) + + async def _getChromosomeNames( + self, + genome_prefix, + ): + # connect to the index + chromosome_index = AsyncSearch( + self.redis_connection, index_name="chromosomeIdx" + ) + # replace RediSearch breakpoint characters with spaces + cleaned_name = "" + for c in genome_prefix: + if c in self.breakpoint_characters: + cleaned_name += " " + else: + cleaned_name += c + # search the chromosome index + # first get a count + query = Query(cleaned_name).in_order().paging(0, 0) + result = await chromosome_index.search(query) + num_chromosomes = result.total + # then get the chromosomes + query = ( + Query(cleaned_name) + .in_order() + .limit_fields("name") + .return_fields("name") + .paging(0, num_chromosomes) + ) + result = await 
chromosome_index.search(query)
+        chromosome_names = list(map(lambda d: d.name, result.docs))
+        return chromosome_names
+
+    # returns the PAF row for a single macro-synteny block
+    async def _blockToPafRow(
+        self,
+        query_chromosome_name,
+        query_chromosome_length,
+        target_chromosome_name,
+        target_chromosome_length,
+        target_block,
+        # default values for PAF columns that are not available from the microservices
+        num_residue_matches=1,
+        alignment_block_length=1,
+        mapping_quality=255,  # denotes 'missing'
+    ):
+        # Check if block has enriched gene information from macro-synteny-blocks
+        if hasattr(target_block, "queryGeneFmin") and target_block.queryGeneFmin:
+            # Use pre-fetched gene positions from enriched blocks
+            query_start = target_block.queryGeneFmin
+            query_end = target_block.queryGeneFmax
+        else:
+            # Fallback: get gene information from the genes microservice
+            # This path is used when macro-synteny-blocks doesn't have genes_address configured
+            gene_names = [list((await getChromosome(query_chromosome_name, self.chromosome_address)).track.genes)[target_block.i]]
+            genes = await getGenes(gene_names, self.genes_address)
+            filtered_genes = list(filter(lambda d: d is not None, genes))
+            # there should be only one match (index 0)
+            query_start = filtered_genes[0].fmin
+            query_end = filtered_genes[0].fmax
+
+        # PAF format is defined here: https://github.com/lh3/miniasm/blob/master/PAF.md
+        paf_row = f"{query_chromosome_name}\t{query_chromosome_length}\t{query_start}\t{query_end}\t{target_block.orientation}\t{target_chromosome_name}\t{target_chromosome_length}\t{target_block.fmin}\t{target_block.fmax}\t{num_residue_matches}\t{alignment_block_length}\t{mapping_quality}"
+
+        # Add optionalMetrics as PAF tag if present (om:B:f,value1,value2,...)
+ if hasattr(target_block, "optionalMetrics") and target_block.optionalMetrics: + metrics_str = ",".join(str(m) for m in target_block.optionalMetrics) + paf_row += f"\tom:B:f,{metrics_str}" + + return paf_row + "\n" + + # returns JSON object for a single macro-synteny block + async def _blockToJson( + self, + query_chromosome_name, + query_chromosome_length, + target_chromosome_name, + target_chromosome_length, + target_block, + # default values for PAF columns that are not available from the microservices + num_residue_matches=1, + alignment_block_length=1, + mapping_quality=255, # denotes 'missing' + ): + # Check if block has enriched gene information from macro-synteny-blocks + if hasattr(target_block, "queryGeneFmin") and target_block.queryGeneFmin: + query_start = target_block.queryGeneFmin + query_end = target_block.queryGeneFmax + else: + gene_names = [list(query_chromosome.track.genes)[target_block.i]] + genes = await getGenes(gene_names, self.genes_address) + filtered_genes = list(filter(lambda d: d is not None, genes)) + # there should be only one match (index 0) + query_start = filtered_genes[0].fmin + query_end = filtered_genes[0].fmax + + result = { + "query": { + "name": query_chromosome_name, + "length": query_chromosome_length, + "start": query_start, + "end": query_end, + }, + "target": { + "name": target_chromosome_name, + "length": target_chromosome_length, + "start": target_block.fmin, + "end": target_block.fmax, + }, + "strand": target_block.orientation, + "numResidueMatches": num_residue_matches, + "alignmentBlockLength": alignment_block_length, + "mappingQuality": mapping_quality, + } + # Include identity if present + if hasattr(target_block, "identity") and target_block.HasField("identity"): + result["identity"] = target_block.identity + # Include optionalMetrics if present + if hasattr(target_block, "optionalMetrics"): + metrics_list = list(target_block.optionalMetrics) + logging.debug( + f"Block has optionalMetrics: {metrics_list}, length: 
{len(metrics_list)}" + ) + if metrics_list: + result["optionalMetrics"] = metrics_list + else: + logging.debug( + f"Block does not have optionalMetrics attribute. Block attributes: {dir(target_block)}" + ) + return result + + # returns PAF rows for a target block object (containing multiple macro-synteny blocks) + async def _blocksToPafRows( + self, + query_chromosome_name, + query_chromosome_length, + target_block, + ): + # Check if target block has enriched chromosomeLength from macro-synteny-blocks + if hasattr(target_block, "chromosomeLength") and target_block.chromosomeLength: + # Use pre-fetched chromosome length from enriched blocks + target_chromosome_length = target_block.chromosomeLength + else: + # Fallback: get target chromosome length from the chromosome microservice + # This path is used when macro-synteny-blocks doesn't have enrichment enabled + target_chromosome_length = await getChromosomeLength( + target_block.chromosome, + self.chromosome_address, + ) + + paf_rows = await asyncio.gather( + *[ + # compute PAF rows for each target block + self._blockToPafRow( + query_chromosome_name, + query_chromosome_length, + target_block.chromosome, + target_chromosome_length, + tgt_block, + ) + for tgt_block in target_block.blocks + ] + ) + return "".join(paf_rows) + + # returns JSON array for a target block object (containing multiple macro-synteny blocks) + async def _blocksToJson( + self, + query_chromosome_name, + query_chromosome_length, + target_block, + query_assembly_name=None, + target_assembly_name=None, + anchors=None, + ): + # Check if target block has enriched chromosomeLength from macro-synteny-blocks + if hasattr(target_block, "chromosomeLength") and target_block.chromosomeLength: + target_chromosome_length = target_block.chromosomeLength + else: + target_chromosome_length = await getChromosomeLength( + target_block.chromosome, + self.chromosome_address, + ) + + json_objects = await asyncio.gather( + *[ + self._blockToJson( + 
query_chromosome_name, + query_chromosome_length, + target_block.chromosome, + target_chromosome_length, + tgt_block, + ) + for tgt_block in target_block.blocks + ] + ) + + # If regular anchors mode, extract correspondences as top-level JBrowse-compatible objects + if anchors == "regular": + jbrowse_objects = [] + for idx, tgt_block in enumerate(target_block.blocks): + if hasattr(tgt_block, "correspondences") and tgt_block.correspondences: + for corr_idx, corr in enumerate(tgt_block.correspondences): + # Skip self-identity correspondences (same position on query and target) + # These create "vertical lines" that obscure synteny relationships + if ( + corr.query_fmin == corr.target_fmin + and corr.query_fmax == corr.target_fmax + ): + continue + # Generate unique ID from chromosome names and coordinates + unique_id = f"{query_chromosome_name}:{corr.query_fmin}-{corr.query_fmax}_{target_block.chromosome}:{corr.target_fmin}-{corr.target_fmax}" + jbrowse_obj = { + "uniqueId": unique_id, + "refName": target_block.chromosome, + "start": corr.target_fmin, + "end": corr.target_fmax, + "assemblyName": target_assembly_name, + "strand": tgt_block.orientation, + "mate": { + "refName": query_chromosome_name, + "start": corr.query_fmin, + "end": corr.query_fmax, + "assemblyName": query_assembly_name, + }, + } + # Include identity if present on the block + if hasattr(tgt_block, "identity") and tgt_block.HasField( + "identity" + ): + jbrowse_obj["identity"] = tgt_block.identity + jbrowse_objects.append(jbrowse_obj) + return jbrowse_objects + + return json_objects + + def _generate_cache_key( + self, + genome_1, + genome_2, + matched, + intermediate, + mask, + metrics, + chromosome_genes, + chromosome_length, + output_format, + identity=None, + anchors=None, + ): + """ + Generate a deterministic cache key from request parameters. + + Returns: + str: A Redis key for caching this specific computation. 
+ """ + # Convert metrics list to a stable string representation + metrics_str = ",".join(sorted(metrics)) if metrics else "" + identity_str = identity if identity else "" + anchors_str = anchors if anchors else "" + # Create a composite key from all parameters including format + key_components = ( + f"{genome_1}:{genome_2}:{matched}:{intermediate}:" + f"{mask}:{metrics_str}:{chromosome_genes}:{chromosome_length}:{output_format}:{identity_str}:{anchors_str}" + ) + # Hash to create a fixed-length key + hash_digest = hashlib.sha256(key_components.encode()).hexdigest() + # Use a versioned prefix to allow cache invalidation if format changes + return f"synteny_cache:v5:{hash_digest}" + + async def _computeResults( + self, + query_chromosome_name, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + grpc_decode, + output_format, + identity=None, + anchors=None, + query_assembly_name=None, + target_assembly_name=None, + ): + # Use the new ComputeByChromosome endpoint in macro-synteny-blocks + # This now returns enriched blocks with gene positions and chromosome lengths + # Request correspondences from backend when anchors="regular" + correspondences = anchors == "regular" + target_blocks = await computeMacroSyntenyBlocksByChromosome( + query_chromosome_name, + matched, + intermediate, + mask, + targets, + metrics, + chromosome_genes, + chromosome_length, + self.macrosyntenyblocks_address, + identity, + correspondences, + ) + # remove the targets that didn't return any blocks + filtered_target_blocks = list(filter(lambda b: b is not None, target_blocks)) + + # Get query chromosome length (still needed for both formats) + # NOTE: If blocks are enriched, we could optimize this by getting it from + # the chromosome service call inside macro-synteny-blocks, but that would + # require passing it back in the response + query_chromosome = await getChromosome( + query_chromosome_name, self.chromosome_address + ) + 
query_chromosome_length = query_chromosome.length + + if output_format == "paf": + # Return PAF format (tab-delimited text) + paf_rows = await asyncio.gather( + *[ + # compute PAF rows for each target block + self._blocksToPafRows( + query_chromosome_name, + query_chromosome_length, + target_block, + ) + for target_block in filtered_target_blocks + ] + ) + return "".join(paf_rows) + else: + # Return JSON format (list of alignment objects) + json_arrays = await asyncio.gather( + *[ + # compute JSON objects for each target block + self._blocksToJson( + query_chromosome_name, + query_chromosome_length, + target_block, + query_assembly_name, + target_assembly_name, + anchors=anchors, + ) + for target_block in filtered_target_blocks + ] + ) + # Flatten the list of lists into a single list + return [item for sublist in json_arrays for item in sublist] + + async def process( + self, + genome_1, + genome_2, + matched, + intermediate, + mask, + metrics, + chromosome_genes, + chromosome_length, + identity=None, + anchors=None, + grpc_decode=False, + output_format="json", + ): + cache_key = None + # Check cache first if caching is enabled + if self.cache_enabled: + cache_key = self._generate_cache_key( + genome_1, + genome_2, + matched, + intermediate, + mask, + metrics, + chromosome_genes, + chromosome_length, + output_format, + identity, + anchors, + ) + + try: + cached_result = await self.redis_connection.get(cache_key) + if cached_result: + try: + cached_json = json.loads(cached_result) + return cached_json + except json.JSONDecodeError: + return cached_result + except Exception as e: + # Log cache retrieval errors but continue with computation + # This ensures cache failures don't break the service + logging.warning(f"Cache retrieval failed for key {cache_key}: {e}") + + # Cache miss or caching disabled - compute the result + genome_1_chrs = await self._getChromosomeNames(genome_1) + genome_2_chrs = await self._getChromosomeNames(genome_2) + iter(genome_1_chrs) # 
TypeError if not iterable + iter(genome_2_chrs) # TypeError if not iterable + + results = await asyncio.gather( + *[ + # compute results for each target chromosome + self._computeResults( + chr1_name, + matched, + intermediate, + mask, + genome_2_chrs, + metrics, + chromosome_genes, + chromosome_length, + grpc_decode, + output_format, + identity, + anchors, + query_assembly_name=genome_1, + target_assembly_name=genome_2, + ) + for chr1_name in genome_1_chrs + ] + ) + + # Combine results based on format + if output_format == "paf": + result = "".join(results) + else: + all_alignments = [item for sublist in results for item in sublist] + result = {"alignments": all_alignments} + + # Store result in cache if caching is enabled + if self.cache_enabled and cache_key is not None: + try: + # For JSON, serialize before caching + cache_value = result + if output_format == "json": + cache_value = json.dumps(result) + + # Store with TTL + await self.redis_connection.setex( + cache_key, self.cache_ttl, cache_value + ) + except Exception as e: + # Log cache storage errors but don't fail the request + # The computation succeeded, cache failure is non-critical + logging.warning(f"Cache storage failed for key {cache_key}: {e}") + + return result diff --git a/macro_synteny_paf/proto/block/v1/block.proto b/macro_synteny_paf/proto/block/v1/block.proto new file mode 100644 index 00000000..27f03bee --- /dev/null +++ b/macro_synteny_paf/proto/block/v1/block.proto @@ -0,0 +1,35 @@ +syntax = "proto3"; + +package legumeinfo.microservices.block.v1; + + +message Blocks { + string chromosome = 1; + string genus = 2; + string species = 3; + repeated Block blocks = 4; + optional uint32 chromosomeLength = 5; // Length of target chromosome in base pairs +} + +message Correspondence { + uint32 query_index = 1; + uint32 target_index = 2; + uint32 target_fmin = 3; // Target gene start position (bp) + uint32 target_fmax = 4; // Target gene end position (bp) + optional uint32 query_fmin = 5; // Query 
gene start position (bp), if enriched + optional uint32 query_fmax = 6; // Query gene end position (bp), if enriched +} + +message Block { + uint32 i = 1; + uint32 j = 2; + uint32 fmin = 3; + uint32 fmax = 4; + string orientation = 5; + repeated float optionalMetrics = 6; + optional string queryGeneName = 7; // Name of query gene at index i + optional uint32 queryGeneFmin = 8; // Query gene start position + optional uint32 queryGeneFmax = 9; // Query gene end position + optional float identity = 10; // Identity score (0.0-1.0) if requested + repeated Correspondence correspondences = 11; // Gene pair correspondences within the block +} diff --git a/macro_synteny_paf/proto/chromosome_service/v1/chromosome.proto b/macro_synteny_paf/proto/chromosome_service/v1/chromosome.proto new file mode 100644 index 00000000..67abc3a2 --- /dev/null +++ b/macro_synteny_paf/proto/chromosome_service/v1/chromosome.proto @@ -0,0 +1,20 @@ +syntax = "proto3"; + +import "track/v1/track.proto"; + +package legumeinfo.microservices.chromosome_service.v1; + + +service Chromosome { + rpc Get (ChromosomeGetRequest) returns (ChromosomeGetReply) {} +} + + +message ChromosomeGetRequest { + string name = 1; +} + + +message ChromosomeGetReply { + legumeinfo.microservices.track.v1.Chromosome chromosome = 1; +} diff --git a/macro_synteny_paf/proto/gene/v1/gene.proto b/macro_synteny_paf/proto/gene/v1/gene.proto new file mode 100644 index 00000000..ad281da9 --- /dev/null +++ b/macro_synteny_paf/proto/gene/v1/gene.proto @@ -0,0 +1,13 @@ +syntax = "proto3"; + +package legumeinfo.microservices.gene.v1; + + +message Gene { + string name = 1; + uint32 fmin = 2; + uint32 fmax = 3; + int32 strand = 4; + string family = 5; + string chromosome = 6; +} diff --git a/macro_synteny_paf/proto/genes_service/v1/genes.proto b/macro_synteny_paf/proto/genes_service/v1/genes.proto new file mode 100644 index 00000000..a1145025 --- /dev/null +++ b/macro_synteny_paf/proto/genes_service/v1/genes.proto @@ -0,0 +1,20 @@ +syntax = 
"proto3"; + +import "gene/v1/gene.proto"; + +package legumeinfo.microservices.genes_service.v1; + + +service Genes { + rpc Get (GenesGetRequest) returns (GenesGetReply) {} +} + + +message GenesGetRequest { + repeated string names = 1; +} + + +message GenesGetReply { + repeated legumeinfo.microservices.gene.v1.Gene genes = 1; +} diff --git a/macro_synteny_paf/proto/macrosyntenyblocks_service/v1/macrosyntenyblocks.proto b/macro_synteny_paf/proto/macrosyntenyblocks_service/v1/macrosyntenyblocks.proto new file mode 100644 index 00000000..000e2825 --- /dev/null +++ b/macro_synteny_paf/proto/macrosyntenyblocks_service/v1/macrosyntenyblocks.proto @@ -0,0 +1,44 @@ +syntax = "proto3"; + +import "block/v1/block.proto"; + +package legumeinfo.microservices.macrosyntenyblocks_service.v1; + + +service MacroSyntenyBlocks { + rpc Compute (MacroSyntenyBlocksComputeRequest) returns (MacroSyntenyBlocksComputeReply) {} + rpc ComputeByChromosome (MacroSyntenyBlocksComputeByChromosomeRequest) returns (MacroSyntenyBlocksComputeReply) {} +} + + +message MacroSyntenyBlocksComputeRequest { + repeated string chromosome = 1; + uint32 matched = 2; + uint32 intermediate = 3; + optional uint32 mask = 4; + repeated string targets = 5; + repeated string optionalMetrics = 6; + optional uint32 chromosomeGenes = 7; + optional uint32 chromosomeLength = 8; + optional string identity = 9; // "levenshtein" or "jaccard" - computes identity metric + optional bool correspondences = 10; // If true, include gene pair correspondences within blocks +} + + +message MacroSyntenyBlocksComputeByChromosomeRequest { + string chromosomeName = 1; + uint32 matched = 2; + uint32 intermediate = 3; + optional uint32 mask = 4; + repeated string targets = 5; + repeated string optionalMetrics = 6; + optional uint32 chromosomeGenes = 7; + optional uint32 chromosomeLength = 8; + optional string identity = 9; // "levenshtein" or "jaccard" - computes identity metric + optional bool correspondences = 10; // If true, include gene 
pair correspondences within blocks +} + + +message MacroSyntenyBlocksComputeReply { + repeated legumeinfo.microservices.block.v1.Blocks blocks = 1; +} diff --git a/macro_synteny_paf/proto/macrosyntenypaf_service/v1/macrosyntenypaf.proto b/macro_synteny_paf/proto/macrosyntenypaf_service/v1/macrosyntenypaf.proto new file mode 100644 index 00000000..aa05dc0e --- /dev/null +++ b/macro_synteny_paf/proto/macrosyntenypaf_service/v1/macrosyntenypaf.proto @@ -0,0 +1,29 @@ +syntax = "proto3"; + +import "block/v1/block.proto"; + +package legumeinfo.microservices.macrosyntenypaf_service.v1; + + +service MacroSyntenyPaf { + rpc Compute (MacroSyntenyPafComputeRequest) returns (MacroSyntenyPafComputeReply) {} +} + + +message MacroSyntenyPafComputeRequest { + repeated string chromosome = 1; + uint32 matched = 2; + uint32 intermediate = 3; + optional uint32 mask = 4; + repeated string targets = 5; + repeated string optionalMetrics = 6; + optional uint32 chromosomeGenes = 7; + optional uint32 chromosomeLength = 8; + optional string identity = 9; // "levenshtein" or "jaccard" - computes identity metric + optional bool correspondences = 10; // If true, include gene pair correspondences within blocks +} + + +message MacroSyntenyPafComputeReply { + repeated legumeinfo.microservices.block.v1.Blocks blocks = 1; +} diff --git a/macro_synteny_paf/proto/track/v1/track.proto b/macro_synteny_paf/proto/track/v1/track.proto new file mode 100644 index 00000000..7a09cb1c --- /dev/null +++ b/macro_synteny_paf/proto/track/v1/track.proto @@ -0,0 +1,23 @@ +syntax = "proto3"; + +package legumeinfo.microservices.track.v1; + + +message Track { + string genus = 2; + string species = 3; + repeated string genes = 4; + repeated string families = 5; +} + + +message Chromosome { + uint32 length = 1; + Track track = 2; +} + + +message MicroTrack { + string name = 1; + Track track = 2; +} diff --git a/macro_synteny_paf/pytest.ini b/macro_synteny_paf/pytest.ini new file mode 100644 index 00000000..eb9ac230 --- 
/dev/null +++ b/macro_synteny_paf/pytest.ini @@ -0,0 +1,17 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +asyncio_mode = auto +markers = + unit: Unit tests (fast, no external dependencies) + integration: Integration tests (slower, may use Docker) + slow: Slow tests (>1s) +addopts = + -v + --tb=short + --strict-markers + --cov=macro_synteny_paf + --cov-report=term-missing + --cov-report=html diff --git a/macro_synteny_paf/requirements-test.txt b/macro_synteny_paf/requirements-test.txt new file mode 100644 index 00000000..b04ed5ed --- /dev/null +++ b/macro_synteny_paf/requirements-test.txt @@ -0,0 +1,5 @@ +pytest==8.3.5 +pytest-asyncio==0.24.0 +pytest-cov==6.0.0 +pytest-mock==3.14.0 +fakeredis==2.27.0 diff --git a/macro_synteny_paf/requirements.txt b/macro_synteny_paf/requirements.txt new file mode 100644 index 00000000..39895382 --- /dev/null +++ b/macro_synteny_paf/requirements.txt @@ -0,0 +1,51 @@ +# +# This file is autogenerated by pip-compile with Python 3.13 +# by the following command: +# +# pip-compile +# +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.13.0 + # via + # aiohttp-cors + # macro_synteny_paf (setup.py) +aiohttp-cors==0.8.1 + # via macro_synteny_paf (setup.py) +aiosignal==1.4.0 + # via aiohttp +attrs==25.4.0 + # via aiohttp +frozenlist==1.8.0 + # via + # aiohttp + # aiosignal +grpcio>=1.78.0 + # via + # grpcio-tools + # macro_synteny_paf (setup.py) +grpcio-tools>=1.78.0 + # via macro_synteny_paf (setup.py) +idna==3.11 + # via yarl +multidict==6.7.0 + # via + # aiohttp + # yarl +propcache==0.4.1 + # via + # aiohttp + # yarl +protobuf==6.32.1 + # via grpcio-tools +redis==6.4.0 + # via macro_synteny_paf (setup.py) +typing-extensions==4.15.0 + # via grpcio +uvloop==0.21.0 + # via macro_synteny_paf (setup.py) +yarl==1.22.0 + # via aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/macro_synteny_paf/run-tests.sh 
b/macro_synteny_paf/run-tests.sh new file mode 100755 index 00000000..df983e38 --- /dev/null +++ b/macro_synteny_paf/run-tests.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# +# This script uses the base compose configuration from gcv-docker-compose and overlays +# test-specific configuration. +# +# The GCV_DOCKER_COMPOSE environment variable should point to the gcv-docker-compose +# directory (defaults to ../ - the parent directory). +# +# Usage: +# ./run-tests.sh + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Compose file paths +# Use gcv-docker-compose directory for base configuration +GCV_DOCKER_COMPOSE="${GCV_DOCKER_COMPOSE:-../}" +COMPOSE_BASE="$GCV_DOCKER_COMPOSE/compose.yml" +COMPOSE_DEV="$GCV_DOCKER_COMPOSE/compose.dev.yml" +COMPOSE_TEST="compose.test.yml" + +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Running macro_synteny_paf tests${NC}" +echo -e "${GREEN}========================================${NC}" +echo "" + +# Verify compose files exist +if [ ! -f "$COMPOSE_BASE" ]; then + echo -e "${RED}Error: $COMPOSE_BASE not found${NC}" + echo -e "${YELLOW}Set GCV_DOCKER_COMPOSE environment variable to the gcv-docker-compose directory${NC}" + echo -e "${YELLOW}Default: ../ (parent directory)${NC}" + exit 1 +fi + +if [ ! 
-f "$COMPOSE_DEV" ]; then + echo -e "${RED}Error: $COMPOSE_DEV not found${NC}" + exit 1 +fi + +echo -e "${YELLOW}Using compose files from: ${GCV_DOCKER_COMPOSE}${NC}" + +# Save current directory +ORIGINAL_DIR=$(pwd) + +# Export paths for use in compose.test.yml +export MACRO_SYNTENY_PAF_DIR="$ORIGINAL_DIR" +export TEST_DATA_DIR="$(dirname "$ORIGINAL_DIR")" # Parent of macro_synteny_paf + +# Change to GCV docker compose directory for running compose +cd "$GCV_DOCKER_COMPOSE" + +# Make compose.test.yml path absolute +COMPOSE_TEST_ABS="$ORIGINAL_DIR/compose.test.yml" + +# Clean up any existing containers +echo -e "${YELLOW}Cleaning up existing containers...${NC}" +docker compose -f "$COMPOSE_BASE" -f "$COMPOSE_DEV" -f "$COMPOSE_TEST_ABS" --profile services down -v 2>/dev/null || true + +# Build and run tests +echo -e "${YELLOW}Building and starting services...${NC}" +docker compose -f "$COMPOSE_BASE" -f "$COMPOSE_DEV" -f "$COMPOSE_TEST_ABS" --profile services up --build --abort-on-container-exit --exit-code-from macro_synteny_paf_test + +# Capture exit code +TEST_EXIT_CODE=$? 
+ +# Clean up +echo "" +echo -e "${YELLOW}Cleaning up...${NC}" +docker compose -f "$COMPOSE_BASE" -f "$COMPOSE_DEV" -f "$COMPOSE_TEST_ABS" --profile services down -v + +# Return to original directory +cd "$ORIGINAL_DIR" + +# Report results +echo "" +echo -e "${GREEN}========================================${NC}" +if [ $TEST_EXIT_CODE -eq 0 ]; then + echo -e "${GREEN}✓ Tests passed!${NC}" +else + echo -e "${RED}✗ Tests failed with exit code $TEST_EXIT_CODE${NC}" +fi +echo -e "${GREEN}========================================${NC}" +echo "" +echo -e "Coverage report available in: ${YELLOW}./htmlcov/index.html${NC}" + +exit $TEST_EXIT_CODE diff --git a/macro_synteny_paf/setup.cfg b/macro_synteny_paf/setup.cfg new file mode 100644 index 00000000..bac1917f --- /dev/null +++ b/macro_synteny_paf/setup.cfg @@ -0,0 +1,42 @@ +[metadata] +name = macro_synteny_paf +version = attr: macro_synteny_paf.__version__ +description = A Microservice that returns pairwise macro-synteny blocks between all chromosomes of a query genome and a target genome, in PAF format +long_description = file: README.md +url = https://github.com/legumeinfo/microservices +author = Sven Redsun +author_email = sgr@ncgr.org +keywords = genomics, bioinformatics, microservices, redis, chado, gff, paf +license = Apache-2.0 +classifiers = + Development Status :: 4 - Beta + Environment :: Console + Intended Audience :: Science/Research + Topic :: Scientific/Engineering :: Bio-Informatics + License :: OSI Approved :: Apache Software License + Operating System :: OS Independent + Programming Language :: Python :: 3 + Programming Language :: Python :: 3 :: Only + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 +project_urls = + Bug Reports = https://github.com/legumeinfo/microservices/issues + Source = https://github.com/legumeinfo/microservices + + +[options] +packages = find: +python_requires = >=3.7,<4 
+install_requires = + aiohttp + aiohttp-cors + grpcio + grpcio-tools + redis + uvloop + +[options.entry_points] +console_scripts = + macro_synteny_paf = macro_synteny_paf.__main__:main diff --git a/macro_synteny_paf/setup.py b/macro_synteny_paf/setup.py new file mode 100644 index 00000000..ad82a3cf --- /dev/null +++ b/macro_synteny_paf/setup.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +# Python +import setuptools +from setuptools.command.build_py import build_py + +# package +from macro_synteny_paf import commands + +PACKAGE_DIRECTORIES = { + "": ".", +} + + +build_proto_command = "build_proto" + + +class BuildPy(build_py): + """Custom build_py command class.""" + + def run(self): + build_py.run(self) + self.run_command(build_proto_command) + + +SETUP_REQUIRES = ("grpcio-tools",) +COMMAND_CLASS = {build_proto_command: commands.BuildProtos, "build_py": BuildPy} + + +setuptools.setup( + package_dir=PACKAGE_DIRECTORIES, + setup_requires=SETUP_REQUIRES, + cmdclass=COMMAND_CLASS, +) diff --git a/macro_synteny_paf/tests/conftest.py b/macro_synteny_paf/tests/conftest.py new file mode 100644 index 00000000..76561a7d --- /dev/null +++ b/macro_synteny_paf/tests/conftest.py @@ -0,0 +1,90 @@ +from collections import namedtuple + +import pytest + +# Mock gRPC objects +Gene = namedtuple("Gene", ["name", "fmin", "fmax"]) +Chromosome = namedtuple("Chromosome", ["name", "genus", "species", "length", "track"]) +Track = namedtuple("Track", ["genes"]) +Block = namedtuple( + "Block", ["i", "j", "fmin", "fmax", "orientation", "queryGeneFmin", "queryGeneFmax"] +) +Blocks = namedtuple( + "Blocks", ["chromosome", "genus", "species",
"blocks", "chromosomeLength"] + ) + + return [ + Blocks( + chromosome="target_chr1", + genus="Target", + species="species", + chromosomeLength=10000, + blocks=[ + Block( + i=0, + j=2, + fmin=0, + fmax=2999, + orientation="+", + queryGeneFmin=0, + queryGeneFmax=2999, + ), + Block( + i=3, + j=4, + fmin=3000, + fmax=4999, + orientation="-", + queryGeneFmin=3000, + queryGeneFmax=4999, + ), + ], + ), + Blocks( + chromosome="target_chr2", + genus="Target", + species="species", + chromosomeLength=8000, + blocks=[ + Block( + i=1, + j=3, + fmin=1000, + fmax=3999, + orientation="+", + queryGeneFmin=1000, + queryGeneFmax=3999, + ), + ], + ), + ] + + +@pytest.fixture +def sample_blocks_without_enrichment(): + """Sample blocks without enrichment (legacy format).""" + Block = namedtuple("Block", ["i", "j", "fmin", "fmax", "orientation"]) + Blocks = namedtuple("Blocks", ["chromosome", "genus", "species", "blocks"]) + + return [ + Blocks( + chromosome="target_chr1", + genus="Target", + species="species", + blocks=[ + Block(i=0, j=2, fmin=0, fmax=2999, orientation="+"), + ], + ), + ] diff --git a/macro_synteny_paf/tests/test_request_handler.py b/macro_synteny_paf/tests/test_request_handler.py new file mode 100644 index 00000000..2e97b586 --- /dev/null +++ b/macro_synteny_paf/tests/test_request_handler.py @@ -0,0 +1,319 @@ +import hashlib +import json +from collections import namedtuple + +import pytest + +from macro_synteny_paf.request_handler import RequestHandler + + +@pytest.mark.unit +class TestGenerateCacheKey: + """Test cache key generation.""" + + def setup_method(self): + self.handler = RequestHandler( + redis_connection=None, + chromosome_address="localhost:8081", + genes_address="localhost:8082", + macrosyntenyblocks_address="localhost:8083", + ) + + def test_deterministic_key(self): + """Test that same inputs produce same key.""" + key1 = self.handler._generate_cache_key( + "genome1", "genome2", 10, 5, None, [], 10, 1, "paf" + ) + key2 = self.handler._generate_cache_key( 
+ "genome1", "genome2", 10, 5, None, [], 10, 1, "paf" + ) + + assert key1 == key2 + + def test_different_formats_different_keys(self): + """Test that PAF and JSON formats have different cache keys.""" + key_paf = self.handler._generate_cache_key( + "genome1", "genome2", 10, 5, None, [], 10, 1, "paf" + ) + key_json = self.handler._generate_cache_key( + "genome1", "genome2", 10, 5, None, [], 10, 1, "json" + ) + + assert key_paf != key_json + + def test_different_parameters_different_keys(self): + """Test that different parameters produce different keys.""" + key1 = self.handler._generate_cache_key( + "genome1", "genome2", 10, 5, None, [], 10, 1, "paf" + ) + key2 = self.handler._generate_cache_key( + "genome1", "genome2", 20, 5, None, [], 10, 1, "paf" # Different matched + ) + + assert key1 != key2 + + def test_metrics_order_independent(self): + """Test that metrics list order doesn't affect key (sorted internally).""" + key1 = self.handler._generate_cache_key( + "genome1", "genome2", 10, 5, None, ["jaccard", "levenshtein"], 10, 1, "paf" + ) + key2 = self.handler._generate_cache_key( + "genome1", "genome2", 10, 5, None, ["levenshtein", "jaccard"], 10, 1, "paf" + ) + + assert key1 == key2 + + def test_key_format(self): + """Test that key has correct format with version prefix.""" + key = self.handler._generate_cache_key( + "genome1", "genome2", 10, 5, None, [], 10, 1, "paf" + ) + + # Should start with version prefix + assert key.startswith("synteny_cache:v2:") + + # Should be followed by SHA256 hash (64 hex chars) + hash_part = key.split(":")[-1] + assert len(hash_part) == 64 + assert all(c in "0123456789abcdef" for c in hash_part) + + def test_sha256_hash(self): + """Test that the hash is valid SHA256.""" + key = self.handler._generate_cache_key( + "genome1", "genome2", 10, 5, None, [], 10, 1, "paf" + ) + + hash_part = key.split(":")[-1] + + # Verify it's a valid SHA256 hash by checking length and hex format + try: + int(hash_part, 16) + assert len(hash_part) == 64 + 
except ValueError: + pytest.fail("Hash is not valid hexadecimal") + + def test_none_vs_mask_value(self): + """Test that None mask produces different key than mask value.""" + key1 = self.handler._generate_cache_key( + "genome1", "genome2", 10, 5, None, [], 10, 1, "paf" + ) + key2 = self.handler._generate_cache_key( + "genome1", "genome2", 10, 5, 10, [], 10, 1, "paf" + ) + + assert key1 != key2 + + +@pytest.mark.unit +@pytest.mark.asyncio +class TestBlockToPafRow: + """Test PAF format conversion.""" + + def setup_method(self): + self.handler = RequestHandler( + redis_connection=None, + chromosome_address="localhost:8081", + genes_address="localhost:8082", + macrosyntenyblocks_address="localhost:8083", + ) + + async def test_paf_format_with_enrichment(self): + """Test PAF row generation with enriched gene info.""" + Block = namedtuple( + "Block", + ["i", "j", "fmin", "fmax", "orientation", "queryGeneFmin", "queryGeneFmax"], + ) + block = Block( + i=0, + j=2, + fmin=0, + fmax=2999, + orientation="+", + queryGeneFmin=100, + queryGeneFmax=2500, + ) + + paf_row = await self.handler._blockToPafRow( + query_chromosome_name="query_chr", + query_chromosome_length=10000, + target_chromosome_name="target_chr", + target_chromosome_length=8000, + target_block=block, + ) + + # PAF format: qname qlen qstart qend strand tname tlen tstart tend matches alen mapq + fields = paf_row.strip().split("\t") + + assert len(fields) == 12 + assert fields[0] == "query_chr" + assert fields[1] == "10000" + assert fields[2] == "100" # queryGeneFmin + assert fields[3] == "2500" # queryGeneFmax + assert fields[4] == "+" + assert fields[5] == "target_chr" + assert fields[6] == "8000" + assert fields[7] == "0" # target fmin + assert fields[8] == "2999" # target fmax + + async def test_paf_reverse_orientation(self): + """Test PAF row with reverse orientation.""" + Block = namedtuple( + "Block", + ["i", "j", "fmin", "fmax", "orientation", "queryGeneFmin", "queryGeneFmax"], + ) + block = Block( + i=0, + 
j=2, + fmin=0, + fmax=2999, + orientation="-", + queryGeneFmin=100, + queryGeneFmax=2500, + ) + + paf_row = await self.handler._blockToPafRow( + query_chromosome_name="query_chr", + query_chromosome_length=10000, + target_chromosome_name="target_chr", + target_chromosome_length=8000, + target_block=block, + ) + + fields = paf_row.strip().split("\t") + assert fields[4] == "-" # Strand field + + +@pytest.mark.unit +@pytest.mark.asyncio +class TestBlockToJson: + """Test JSON format conversion.""" + + def setup_method(self): + self.handler = RequestHandler( + redis_connection=None, + chromosome_address="localhost:8081", + genes_address="localhost:8082", + macrosyntenyblocks_address="localhost:8083", + ) + + async def test_json_format_with_enrichment(self): + """Test JSON object generation with enriched gene info.""" + Block = namedtuple( + "Block", + ["i", "j", "fmin", "fmax", "orientation", "queryGeneFmin", "queryGeneFmax"], + ) + block = Block( + i=0, + j=2, + fmin=0, + fmax=2999, + orientation="+", + queryGeneFmin=100, + queryGeneFmax=2500, + ) + + json_obj = await self.handler._blockToJson( + query_chromosome_name="query_chr", + query_chromosome_length=10000, + target_chromosome_name="target_chr", + target_chromosome_length=8000, + target_block=block, + ) + + # Verify JSON structure + assert "query" in json_obj + assert "target" in json_obj + assert "strand" in json_obj + + # Verify query fields + assert json_obj["query"]["name"] == "query_chr" + assert json_obj["query"]["length"] == 10000 + assert json_obj["query"]["start"] == 100 + assert json_obj["query"]["end"] == 2500 + + # Verify target fields + assert json_obj["target"]["name"] == "target_chr" + assert json_obj["target"]["length"] == 8000 + assert json_obj["target"]["start"] == 0 + assert json_obj["target"]["end"] == 2999 + + # Verify strand + assert json_obj["strand"] == "+" + + # Verify optional fields + assert "numResidueMatches" in json_obj + assert "alignmentBlockLength" in json_obj + assert 
"mappingQuality" in json_obj + + async def test_json_format_reverse_orientation(self): + """Test JSON object with reverse orientation.""" + Block = namedtuple( + "Block", + ["i", "j", "fmin", "fmax", "orientation", "queryGeneFmin", "queryGeneFmax"], + ) + block = Block( + i=0, + j=2, + fmin=0, + fmax=2999, + orientation="-", + queryGeneFmin=100, + queryGeneFmax=2500, + ) + + json_obj = await self.handler._blockToJson( + query_chromosome_name="query_chr", + query_chromosome_length=10000, + target_chromosome_name="target_chr", + target_chromosome_length=8000, + target_block=block, + ) + + assert json_obj["strand"] == "-" + + +@pytest.mark.unit +@pytest.mark.asyncio +class TestBlocksToPafRows: + """Test batch conversion to PAF format.""" + + def setup_method(self): + self.handler = RequestHandler( + redis_connection=None, + chromosome_address="localhost:8081", + genes_address="localhost:8082", + macrosyntenyblocks_address="localhost:8083", + ) + + async def test_paf_rows_with_enriched_chromosome_length(self): + """Test that enriched chromosomeLength is used.""" + Block = namedtuple( + "Block", + ["i", "j", "fmin", "fmax", "orientation", "queryGeneFmin", "queryGeneFmax"], + ) + Blocks = namedtuple("Blocks", ["chromosome", "blocks", "chromosomeLength"]) + + target_block = Blocks( + chromosome="target_chr", + chromosomeLength=8000, # Enriched + blocks=[ + Block( + i=0, + j=2, + fmin=0, + fmax=2999, + orientation="+", + queryGeneFmin=100, + queryGeneFmax=2500, + ), + ], + ) + + paf_rows = await self.handler._blocksToPafRows( + query_chromosome_name="query_chr", + query_chromosome_length=10000, + target_block=target_block, + ) + + # Should use enriched chromosomeLength, not call chromosome service + assert "8000" in paf_rows diff --git a/micro_synteny_search/micro_synteny_search/commands.py b/micro_synteny_search/micro_synteny_search/commands.py index 2e8524e3..2851680c 100644 --- a/micro_synteny_search/micro_synteny_search/commands.py +++ 
b/micro_synteny_search/micro_synteny_search/commands.py @@ -24,8 +24,8 @@ import os import sys +from importlib.resources import files as resource_files -import pkg_resources import setuptools @@ -63,9 +63,7 @@ def build_package_protos(self): if filename.endswith(".proto"): proto_files.append(os.path.abspath(os.path.join(root, filename))) - well_known_protos_include = pkg_resources.resource_filename( - "grpc_tools", "_proto" - ) + well_known_protos_include = str(resource_files("grpc_tools").joinpath("_proto")) for proto_file in proto_files: command = [ diff --git a/micro_synteny_search/requirements.txt b/micro_synteny_search/requirements.txt index c40cfd25..e7cbea9a 100644 --- a/micro_synteny_search/requirements.txt +++ b/micro_synteny_search/requirements.txt @@ -20,11 +20,11 @@ frozenlist==1.7.0 # via # aiohttp # aiosignal -grpcio==1.74.0 +grpcio>=1.78.0 # via # grpcio-tools # micro_synteny_search (setup.py) -grpcio-tools==1.74.0 +grpcio-tools>=1.78.0 # via micro_synteny_search (setup.py) idna==3.10 # via yarl diff --git a/pairwise_macro_synteny_blocks/Dockerfile.test b/pairwise_macro_synteny_blocks/Dockerfile.test new file mode 100644 index 00000000..0398919b --- /dev/null +++ b/pairwise_macro_synteny_blocks/Dockerfile.test @@ -0,0 +1,35 @@ +FROM python:3.13.7-slim-trixie + +# install gcc and other build requirements +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY INSTALL ./ +COPY LICENSE ./ +COPY MANIFEST.in ./ +COPY README.md ./ +COPY setup.cfg ./ +COPY setup.py ./ +COPY requirements.txt ./ +COPY requirements-test.txt ./ +COPY pairwise_macro_synteny_blocks/ ./pairwise_macro_synteny_blocks/ +COPY proto/ ./proto/ +COPY tests/ ./tests/ +COPY pytest.ini ./ + +# install the package dependencies +RUN pip3 install --no-cache-dir -r requirements.txt + +# install test dependencies +RUN pip3 install --no-cache-dir -r requirements-test.txt + +# install (and implicitly build) 
the package +RUN pip3 install --no-cache-dir . + +# CRITICAL: Remove source directory to ensure tests use installed package +RUN rm -rf pairwise_macro_synteny_blocks/ + +CMD ["pytest", "-v", "--tb=short"] diff --git a/pairwise_macro_synteny_blocks/compose.test.yml b/pairwise_macro_synteny_blocks/compose.test.yml new file mode 100644 index 00000000..b01f31b2 --- /dev/null +++ b/pairwise_macro_synteny_blocks/compose.test.yml @@ -0,0 +1,34 @@ +# This file overrides services from +# github.com/legumeinfo/gcv-docker-compose compose.yml and compose.dev.yml +# +# Please use the convenience script: +# ./run-tests.sh + +services: + # Override redis to load test data + redis: + volumes: + # Load test data from dump.rdb + # TEST_DATA_DIR is set by run-tests.sh + - ${TEST_DATA_DIR}/tests/data/dump.rdb:/data/dump.rdb:ro + environment: + REDIS_ARGS: "" # Enable loading from dump.rdb + REDISEARCH_ARGS: "MAXSEARCHRESULTS 100000" + + # Add the test container + pairwise_macro_synteny_blocks_test: + build: + # PAIRWISE_MACRO_SYNTENY_BLOCKS_DIR is set by run-tests.sh + context: ${PAIRWISE_MACRO_SYNTENY_BLOCKS_DIR:-.} + dockerfile: Dockerfile.test + depends_on: + redis: + condition: service_healthy + environment: + # Service connection configuration + REDIS_HOST: redis + REDIS_PORT: "6379" + volumes: + # Mount test results for coverage reports + - ${PAIRWISE_MACRO_SYNTENY_BLOCKS_DIR:-.}/htmlcov:/app/htmlcov:rw + profiles: [] # Always run, no profile required diff --git a/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/commands.py b/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/commands.py index 74b2e383..9430cf7e 100644 --- a/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/commands.py +++ b/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/commands.py @@ -25,8 +25,8 @@ import os import sys +from importlib.resources import files as resource_files -import pkg_resources import setuptools @@ -64,9 +64,7 @@ def build_package_protos(self): if 
filename.endswith(".proto"): proto_files.append(os.path.abspath(os.path.join(root, filename))) - well_known_protos_include = pkg_resources.resource_filename( - "grpc_tools", "_proto" - ) + well_known_protos_include = str(resource_files("grpc_tools").joinpath("_proto")) for proto_file in proto_files: command = [ diff --git a/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/grpc_server.py b/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/grpc_server.py index aa86b565..4ad56ea3 100644 --- a/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/grpc_server.py +++ b/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/grpc_server.py @@ -47,6 +47,8 @@ async def _compute(self, request, context): metrics = request.optionalMetrics or None chromosome_genes = request.chromosomeGenes or None chromosome_length = request.chromosomeLength or None + identity = request.identity or None + correspondences = request.correspondences or None try: ( chromosome, @@ -57,6 +59,8 @@ async def _compute(self, request, context): metrics, chromosome_genes, chromosome_length, + identity, + correspondences, ) = self.handler.parseArguments( chromosome, target, @@ -66,6 +70,8 @@ async def _compute(self, request, context): metrics, chromosome_genes, chromosome_length, + identity, + correspondences, ) except Exception: # raise a gRPC INVALID ARGUMENT error @@ -82,23 +88,40 @@ async def _compute(self, request, context): metrics, chromosome_genes, chromosome_length, + identity, + correspondences, ) if blocks is None: # raise a gRPC NOT FOUND error await context.abort(grpc.StatusCode.NOT_FOUND, "Chromosome not found") - block_messages = list( - map( - lambda b: block_pb2.Block( - i=b["i"], - j=b["j"], - fmin=b["fmin"], - fmax=b["fmax"], - orientation=b["orientation"], - optionalMetrics=b.get("optionalMetrics", []), - ), - blocks, + + def block_to_proto(b): + proto_block = block_pb2.Block( + i=b["i"], + j=b["j"], + fmin=b["fmin"], + fmax=b["fmax"], + 
orientation=b["orientation"], + optionalMetrics=b.get("optionalMetrics", []), ) - ) + if "identity" in b: + proto_block.identity = b["identity"] + if "correspondences" in b: + # Each correspondence is a dict with query_index, target_index, target_fmin, target_fmax + proto_block.correspondences.extend( + [ + block_pb2.Correspondence( + query_index=corr["query_index"], + target_index=corr["target_index"], + target_fmin=corr["target_fmin"], + target_fmax=corr["target_fmax"], + ) + for corr in b["correspondences"] + ] + ) + return proto_block + + block_messages = list(map(block_to_proto, blocks)) return pairwisemacrosyntenyblocks_pb2.PairwiseMacroSyntenyBlocksComputeReply( blocks=block_messages ) diff --git a/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/http_server.py b/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/http_server.py index 0e1f40c1..2f41e29d 100644 --- a/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/http_server.py +++ b/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/http_server.py @@ -16,6 +16,8 @@ async def http_post_handler(request): metrics = data.get("optionalMetrics", None) chromosome_genes = data.get("chromosomeGenes", None) chromosome_length = data.get("chromosomeLength", None) + identity = data.get("identity", None) + correspondences = data.get("correspondences", None) handler = request.app["handler"] try: ( @@ -27,6 +29,8 @@ async def http_post_handler(request): metrics, chromosome_genes, chromosome_length, + identity, + correspondences, ) = handler.parseArguments( chromosome, target, @@ -36,6 +40,8 @@ async def http_post_handler(request): metrics, chromosome_genes, chromosome_length, + identity, + correspondences, ) except Exception: return web.HTTPBadRequest( @@ -50,6 +56,8 @@ async def http_post_handler(request): metrics, chromosome_genes, chromosome_length, + identity, + correspondences, ) if blocks is None: return web.HTTPNotFound(text="Chromosome not found") diff --git 
a/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/metrics.py b/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/metrics.py index c7630d74..3a97ca28 100644 --- a/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/metrics.py +++ b/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/metrics.py @@ -83,4 +83,22 @@ def levenshtein(a, b): return cost[lb] -METRICS = {"jaccard": jaccard, "levenshtein": levenshtein} +def levenshtein_identity(a, b): + # Returns 0.0 for no identity, 1.0 for identical sequences + distance = levenshtein(a, b) + max_length = max(len(a), len(b)) + return 1.0 - (distance / max_length) + + +def jaccard_identity(a, b, n=1, reversals=False, multiset=False): + # Returns 0.0 for no identity, 1.0 for identical sequences + distance = jaccard(a, b, n, reversals, multiset) + return 1.0 - distance + + +METRICS = { + "jaccard": jaccard, + "levenshtein": levenshtein, + "levenshtein_identity": levenshtein_identity, + "jaccard_identity": jaccard_identity, +} diff --git a/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/request_handler.py b/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/request_handler.py index 279f26f6..6907a386 100644 --- a/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/request_handler.py +++ b/pairwise_macro_synteny_blocks/pairwise_macro_synteny_blocks/request_handler.py @@ -27,6 +27,8 @@ def parseArguments( metrics, chromosome_genes, chromosome_length, + identity=None, + correspondences=None, ): iter(chromosome) # TypeError if not iterable if target is None: @@ -66,6 +68,12 @@ def parseArguments( name, args = self._parseMetric(metric) if name not in METRICS: raise ValueError(f'"{metric}" is not a valid metric') + # validate identity parameter + if identity is not None and identity not in ("levenshtein", "jaccard"): + raise ValueError('identity must be "levenshtein" or "jaccard"') + # validate correspondences parameter + if correspondences is not None and not 
isinstance(correspondences, bool): + raise ValueError("correspondences must be a boolean") return ( chromosome, target, @@ -75,6 +83,8 @@ def parseArguments( metrics, chromosome_genes, chromosome_length, + identity, + correspondences, ) # given a query chromosome and a target chromosome as ordered lists of @@ -118,12 +128,17 @@ def _indexBlocksViaIndexPathTraceback(self, path_ends, pointers, scores, matched if end in pointers: # note: singletons aren't in pointers if scores[end] < matched: break - begin = end - while begin in pointers: - begin = pointers.pop(begin) + # Collect the full path during traceback + path = [end] + current = end + while current in pointers: + current = pointers.pop(current) + path.append(current) + path.reverse() # Start to end order + begin = path[0] length = scores[end] - scores[begin] + 1 if length >= matched: - yield (begin, end) + yield (begin, end, path) # "constructs" a DAG using the index pairs as nodes and computes longest # forward (f_) and reverse (r_) oriented paths (blocks) using a recurrence @@ -190,6 +205,8 @@ async def process( metrics, chromosome_genes, chromosome_length, + identity=None, + correspondences=None, ): # connect to the indexes chromosome_index = AsyncSearch( @@ -239,7 +256,7 @@ def maskFilter(f): # convert the index blocks into output blocks blocks = [] pipeline = self.redis_connection.pipeline() - for begin_pair, end_pair in index_blocks: + for begin_pair, end_pair, path in index_blocks: # determine the query start/stop indexes and block orientation based on # the query index values query_start_index, query_stop_index, orientation = ( @@ -261,9 +278,8 @@ def maskFilter(f): "j": query_stop_index, "orientation": orientation, } - # compute optional metrics on the block - if metrics: - block["optionalMetrics"] = [] + # prepare gene families if metrics or identity is requested + if metrics or identity: query_families = list( filter( maskFilter, @@ -278,10 +294,20 @@ def maskFilter(f): ) if orientation == "-": 
target_families = target_families[::-1] - for metric in metrics: - name, args = self._parseMetric(metric) - value = METRICS[name](query_families, target_families, *args) - block["optionalMetrics"].append(value) + # compute optional metrics on the block + if metrics: + block["optionalMetrics"] = [] + for metric in metrics: + name, args = self._parseMetric(metric) + value = METRICS[name](query_families, target_families, *args) + block["optionalMetrics"].append(value) + # compute identity if requested + if identity: + identity_func = METRICS[f"{identity}_identity"] + block["identity"] = identity_func(query_families, target_families) + # add correspondences if requested + if correspondences: + block["correspondences"] = path blocks.append(block) locations = await pipeline.execute() for i, block in enumerate(blocks): @@ -290,4 +316,42 @@ def maskFilter(f): block["fmin"] = min(int(start_fmin), int(start_fmax)) block["fmax"] = max(int(end_fmin), int(end_fmax)) + # If correspondences requested, fetch target gene coordinates for each pair + if correspondences and any("correspondences" in block for block in blocks): + corr_pipeline = self.redis_connection.pipeline() + corr_counts = ( + [] + ) # Track correspondence count per block for offset calculation + for block in blocks: + if "correspondences" in block: + path = block["correspondences"] + for target_idx, query_idx in path: + corr_pipeline.lindex(f"{target_doc_id}:fmins", target_idx) + corr_pipeline.lindex(f"{target_doc_id}:fmaxs", target_idx) + corr_counts.append(len(path)) + else: + corr_counts.append(0) + + corr_locations = await corr_pipeline.execute() + + # Enrich correspondences with target coordinates + offset = 0 + for block, count in zip(blocks, corr_counts): + if count > 0: + enriched = [] + for i, (target_idx, query_idx) in enumerate( + block["correspondences"] + ): + idx = offset + i * 2 + enriched.append( + { + "query_index": query_idx, + "target_index": target_idx, + "target_fmin": int(corr_locations[idx]), 
+ "target_fmax": int(corr_locations[idx + 1]), + } + ) + block["correspondences"] = enriched + offset += count * 2 + return blocks diff --git a/pairwise_macro_synteny_blocks/proto/block/v1/block.proto b/pairwise_macro_synteny_blocks/proto/block/v1/block.proto index 621bb867..d0ffb51a 100644 --- a/pairwise_macro_synteny_blocks/proto/block/v1/block.proto +++ b/pairwise_macro_synteny_blocks/proto/block/v1/block.proto @@ -10,6 +10,15 @@ message Blocks { repeated Block blocks = 4; } +message Correspondence { + uint32 query_index = 1; + uint32 target_index = 2; + uint32 target_fmin = 3; // Target gene start position (bp) + uint32 target_fmax = 4; // Target gene end position (bp) + optional uint32 query_fmin = 5; // Query gene start position (bp), if enriched + optional uint32 query_fmax = 6; // Query gene end position (bp), if enriched +} + message Block { uint32 i = 1; uint32 j = 2; @@ -17,4 +26,7 @@ message Block { uint32 fmax = 4; string orientation = 5; repeated float optionalMetrics = 6; + // Fields 7-9 reserved for queryGeneName, queryGeneFmin, queryGeneFmax (used by macro_synteny_blocks) + optional float identity = 10; // Identity score (0.0-1.0) if requested + repeated Correspondence correspondences = 11; // Gene pair correspondences within the block } diff --git a/pairwise_macro_synteny_blocks/proto/pairwisemacrosyntenyblocks_service/v1/pairwisemacrosyntenyblocks.proto b/pairwise_macro_synteny_blocks/proto/pairwisemacrosyntenyblocks_service/v1/pairwisemacrosyntenyblocks.proto index 5d12110d..173de58e 100644 --- a/pairwise_macro_synteny_blocks/proto/pairwisemacrosyntenyblocks_service/v1/pairwisemacrosyntenyblocks.proto +++ b/pairwise_macro_synteny_blocks/proto/pairwisemacrosyntenyblocks_service/v1/pairwisemacrosyntenyblocks.proto @@ -19,6 +19,8 @@ message PairwiseMacroSyntenyBlocksComputeRequest { repeated string optionalMetrics = 6; optional uint32 chromosomeGenes = 7; optional uint32 chromosomeLength = 8; + optional string identity = 9; // "levenshtein" or 
"jaccard" - computes identity metric + optional bool correspondences = 10; // If true, include gene pair correspondences within blocks } diff --git a/pairwise_macro_synteny_blocks/pytest.ini b/pairwise_macro_synteny_blocks/pytest.ini new file mode 100644 index 00000000..1eeda12a --- /dev/null +++ b/pairwise_macro_synteny_blocks/pytest.ini @@ -0,0 +1,17 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +asyncio_mode = auto +markers = + unit: Unit tests (fast, no external dependencies) + integration: Integration tests (slower, may use Docker) + slow: Slow tests (>1s) +addopts = + -v + --tb=short + --strict-markers + --cov=pairwise_macro_synteny_blocks + --cov-report=term-missing + --cov-report=html diff --git a/pairwise_macro_synteny_blocks/requirements-test.txt b/pairwise_macro_synteny_blocks/requirements-test.txt new file mode 100644 index 00000000..b04ed5ed --- /dev/null +++ b/pairwise_macro_synteny_blocks/requirements-test.txt @@ -0,0 +1,5 @@ +pytest==8.3.5 +pytest-asyncio==0.24.0 +pytest-cov==6.0.0 +pytest-mock==3.14.0 +fakeredis==2.27.0 diff --git a/pairwise_macro_synteny_blocks/requirements.txt b/pairwise_macro_synteny_blocks/requirements.txt index 4d29fc7b..d5a56401 100644 --- a/pairwise_macro_synteny_blocks/requirements.txt +++ b/pairwise_macro_synteny_blocks/requirements.txt @@ -6,7 +6,7 @@ # aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.12.15 +aiohttp==3.13.0 # via # aiohttp-cors # pairwise_macro_synteny_blocks (setup.py) @@ -14,35 +14,37 @@ aiohttp-cors==0.8.1 # via pairwise_macro_synteny_blocks (setup.py) aiosignal==1.4.0 # via aiohttp -attrs==25.3.0 +attrs==25.4.0 # via aiohttp -frozenlist==1.7.0 +frozenlist==1.8.0 # via # aiohttp # aiosignal -grpcio==1.74.0 +grpcio>=1.78.0 # via # grpcio-tools # pairwise_macro_synteny_blocks (setup.py) -grpcio-tools==1.74.0 +grpcio-tools>=1.78.0 # via pairwise_macro_synteny_blocks (setup.py) -idna==3.10 +idna==3.11 # via yarl -multidict==6.6.4 
+multidict==6.7.0 # via # aiohttp # yarl -propcache==0.3.2 +propcache==0.4.1 # via # aiohttp # yarl -protobuf==6.32.0 +protobuf==6.32.1 # via grpcio-tools redis==6.4.0 # via pairwise_macro_synteny_blocks (setup.py) +typing-extensions==4.15.0 + # via grpcio uvloop==0.21.0 # via pairwise_macro_synteny_blocks (setup.py) -yarl==1.20.1 +yarl==1.22.0 # via aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/pairwise_macro_synteny_blocks/run-tests.sh b/pairwise_macro_synteny_blocks/run-tests.sh new file mode 100755 index 00000000..9b02646b --- /dev/null +++ b/pairwise_macro_synteny_blocks/run-tests.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# +# This script uses the base compose configuration from gcv-docker-compose and overlays +# test-specific configuration. +# +# The GCV_DOCKER_COMPOSE environment variable should point to the gcv-docker-compose +# directory (defaults to ../ - the parent directory). +# +# Usage: +# ./run-tests.sh + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Compose file paths +# Use gcv-docker-compose directory for base configuration +GCV_DOCKER_COMPOSE="${GCV_DOCKER_COMPOSE:-../}" +COMPOSE_BASE="$GCV_DOCKER_COMPOSE/compose.yml" +COMPOSE_DEV="$GCV_DOCKER_COMPOSE/compose.dev.yml" +COMPOSE_TEST="compose.test.yml" + +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Running pairwise_macro_synteny_blocks tests${NC}" +echo -e "${GREEN}========================================${NC}" +echo "" + +# Verify compose files exist +if [ ! -f "$COMPOSE_BASE" ]; then + echo -e "${RED}Error: $COMPOSE_BASE not found${NC}" + echo -e "${YELLOW}Set GCV_DOCKER_COMPOSE environment variable to the gcv-docker-compose directory${NC}" + echo -e "${YELLOW}Default: ../ (parent directory)${NC}" + exit 1 +fi + +if [ ! 
-f "$COMPOSE_DEV" ]; then + echo -e "${RED}Error: $COMPOSE_DEV not found${NC}" + exit 1 +fi + +echo -e "${YELLOW}Using compose files from: ${GCV_DOCKER_COMPOSE}${NC}" + +# Save current directory +ORIGINAL_DIR=$(pwd) + +# Export paths for use in compose.test.yml +export PAIRWISE_MACRO_SYNTENY_BLOCKS_DIR="$ORIGINAL_DIR" +export TEST_DATA_DIR="$(dirname "$ORIGINAL_DIR")" # Parent of pairwise_macro_synteny_blocks + +# Change to GCV docker compose directory for running compose +cd "$GCV_DOCKER_COMPOSE" + +# Make compose.test.yml path absolute +COMPOSE_TEST_ABS="$ORIGINAL_DIR/compose.test.yml" + +# Clean up any existing containers +echo -e "${YELLOW}Cleaning up existing containers...${NC}" +docker compose -f "$COMPOSE_BASE" -f "$COMPOSE_DEV" -f "$COMPOSE_TEST_ABS" --profile services down -v 2>/dev/null || true + +# Build and run tests +echo -e "${YELLOW}Building and starting services...${NC}" +docker compose -f "$COMPOSE_BASE" -f "$COMPOSE_DEV" -f "$COMPOSE_TEST_ABS" --profile services up --build --abort-on-container-exit --exit-code-from pairwise_macro_synteny_blocks_test + +# Capture exit code +TEST_EXIT_CODE=$? 
+ +# Clean up +echo "" +echo -e "${YELLOW}Cleaning up...${NC}" +docker compose -f "$COMPOSE_BASE" -f "$COMPOSE_DEV" -f "$COMPOSE_TEST_ABS" --profile services down -v + +# Return to original directory +cd "$ORIGINAL_DIR" + +# Report results +echo "" +echo -e "${GREEN}========================================${NC}" +if [ $TEST_EXIT_CODE -eq 0 ]; then + echo -e "${GREEN}✓ Tests passed!${NC}" +else + echo -e "${RED}✗ Tests failed with exit code $TEST_EXIT_CODE${NC}" +fi +echo -e "${GREEN}========================================${NC}" +echo "" +echo -e "Coverage report available in: ${YELLOW}./htmlcov/index.html${NC}" + +exit $TEST_EXIT_CODE diff --git a/pairwise_macro_synteny_blocks/tests/conftest.py b/pairwise_macro_synteny_blocks/tests/conftest.py new file mode 100644 index 00000000..0f6502b0 --- /dev/null +++ b/pairwise_macro_synteny_blocks/tests/conftest.py @@ -0,0 +1,78 @@ +import fakeredis.aioredis +import pytest + + +@pytest.fixture +async def fakeredis_connection(): + """In-memory Redis for unit tests.""" + redis = fakeredis.aioredis.FakeRedis(decode_responses=True) + yield redis + await redis.aclose() + + +@pytest.fixture +async def redis_with_chromosome(fakeredis_connection): + """Fixture providing fakeredis with sample chromosome data.""" + redis = fakeredis_connection + + # Populate with sample chromosome + target = "test_chr" + await redis.hset( + f"chromosome:{target}", + mapping={ + "name": target, + "genus": "Test", + "species": "species", + "length": "10000", + }, + ) + + # Gene families on the chromosome + families = ["fam1", "fam2", "fam3", "fam4", "fam5", "fam6", "fam7", "fam8"] + await redis.rpush(f"chromosome:{target}:families", *families) + + # Gene names + genes = [f"gene{i}" for i in range(1, 9)] + await redis.rpush(f"chromosome:{target}:genes", *genes) + + # Gene positions (fmin) + fmins = ["0", "1000", "2000", "3000", "4000", "5000", "6000", "7000"] + await redis.rpush(f"chromosome:{target}:fmins", *fmins) + + # Gene positions (fmax) + fmaxs = 
["999", "1999", "2999", "3999", "4999", "5999", "6999", "7999"] + await redis.rpush(f"chromosome:{target}:fmaxs", *fmaxs) + + yield redis + + +@pytest.fixture +async def redis_with_multiple_chromosomes(fakeredis_connection): + """Fixture providing fakeredis with multiple chromosomes.""" + redis = fakeredis_connection + + # Create two chromosomes with different characteristics + for chr_id, (length, num_genes) in [("chr1", (10000, 8)), ("chr2", (5000, 4))]: + await redis.hset( + f"chromosome:{chr_id}", + mapping={ + "name": chr_id, + "genus": "Test", + "species": "species", + "length": str(length), + }, + ) + + families = [f"fam{i}" for i in range(1, num_genes + 1)] + await redis.rpush(f"chromosome:{chr_id}:families", *families) + + genes = [f"gene{chr_id}_{i}" for i in range(1, num_genes + 1)] + await redis.rpush(f"chromosome:{chr_id}:genes", *genes) + + fmins = [str(i * 1000) for i in range(num_genes)] + await redis.rpush(f"chromosome:{chr_id}:fmins", *fmins) + + fmaxs = [str(i * 1000 + 999) for i in range(num_genes)] + await redis.rpush(f"chromosome:{chr_id}:fmaxs", *fmaxs) + + yield redis diff --git a/pairwise_macro_synteny_blocks/tests/test_metrics.py b/pairwise_macro_synteny_blocks/tests/test_metrics.py new file mode 100644 index 00000000..2073ff8f --- /dev/null +++ b/pairwise_macro_synteny_blocks/tests/test_metrics.py @@ -0,0 +1,243 @@ +""" +Unit tests for distance metrics used in synteny block analysis. + +Tests Jaccard distance and Levenshtein distance calculations. 
+""" + +import pytest + +from pairwise_macro_synteny_blocks.metrics import jaccard, levenshtein + + +@pytest.mark.unit +class TestJaccardDistance: + """Test Jaccard distance metric.""" + + def test_identical_sequences(self): + """Distance should be 0 for identical sequences.""" + a = ["fam1", "fam2", "fam3"] + b = ["fam1", "fam2", "fam3"] + + distance = jaccard(a, b) + + assert distance == 0 + + def test_completely_different(self): + """Distance should be 1 for no overlap.""" + a = ["fam1", "fam2", "fam3"] + b = ["fam4", "fam5", "fam6"] + + distance = jaccard(a, b) + + assert distance == 1 + + def test_partial_overlap(self): + """Distance should reflect partial overlap.""" + a = ["fam1", "fam2", "fam3"] + b = ["fam2", "fam3", "fam4"] + + distance = jaccard(a, b) + + # Intersection: {fam2, fam3} = 2 + # Union: {fam1, fam2, fam3, fam4} = 4 + # Jaccard index = 2/4 = 0.5 + # Jaccard distance = 1 - 0.5 = 0.5 + assert distance == 0.5 + + def test_with_2grams(self): + """Test with n=2 (bigrams).""" + a = ["fam1", "fam2", "fam3"] + b = ["fam1", "fam2", "fam4"] + + distance = jaccard(a, b, n=2) + + # a 2-grams: [(fam1,fam2), (fam2,fam3)] + # b 2-grams: [(fam1,fam2), (fam2,fam4)] + # Intersection: {(fam1,fam2)} = 1 + # Union: {(fam1,fam2), (fam2,fam3), (fam2,fam4)} = 3 + # Distance = 1 - 1/3 = 2/3 + assert abs(distance - 2 / 3) < 0.001 + + def test_with_3grams(self): + """Test with n=3 (trigrams).""" + a = ["fam1", "fam2", "fam3", "fam4"] + b = ["fam1", "fam2", "fam3", "fam5"] + + distance = jaccard(a, b, n=3) + + # a 3-grams: [(fam1,fam2,fam3), (fam2,fam3,fam4)] + # b 3-grams: [(fam1,fam2,fam3), (fam2,fam3,fam5)] + # Intersection: {(fam1,fam2,fam3)} = 1 + # Union: 3 + # Distance = 1 - 1/3 = 2/3 + assert abs(distance - 2 / 3) < 0.001 + + def test_n_larger_than_sequence(self): + """When n > sequence length, should return 1.""" + a = ["fam1", "fam2"] + b = ["fam1", "fam2"] + + distance = jaccard(a, b, n=5) + + # n=5 but sequences only have 2 elements + assert distance == 1 + + 
def test_with_reversals_bigrams(self): + """Test reversals with n=2.""" + a = ["fam1", "fam2", "fam3"] + b = ["fam3", "fam2", "fam1"] + + distance = jaccard(a, b, n=2, reversals=True) + + # a 2-grams: [(fam1,fam2), (fam2,fam3)] + # b 2-grams: [(fam3,fam2), (fam2,fam1)] + # With reversals: (fam1,fam2) == (fam2,fam1) and (fam2,fam3) == (fam3,fam2) + assert distance == 0 + + def test_with_multiset(self): + """Test that multiset=True counts duplicates.""" + a = ["fam1", "fam1", "fam2"] + b = ["fam1", "fam2", "fam2"] + + distance_no_multiset = jaccard(a, b, n=1, multiset=False) + distance_with_multiset = jaccard(a, b, n=1, multiset=True) + + # Without multiset: {fam1, fam2} vs {fam1, fam2} -> distance = 0 + assert distance_no_multiset == 0 + + # With multiset: {fam1:2, fam2:1} vs {fam1:1, fam2:2} + # Intersection: min counts = {fam1:1, fam2:1} = 2 + # Union: max counts = {fam1:2, fam2:2} = 4 + # Distance = 1 - 2/4 = 0.5 + assert distance_with_multiset == 0.5 + + def test_string_arguments(self): + """Test that string arguments are correctly parsed.""" + a = ["fam1", "fam2"] + b = ["fam1", "fam2"] + + # n as string + distance = jaccard(a, b, n="2") + assert isinstance(distance, float) + + # reversals as string + distance = jaccard(a, b, reversals="True") + assert isinstance(distance, float) + + def test_empty_sequences(self): + """Test handling of empty sequences.""" + a = [] + b = ["fam1"] + + distance = jaccard(a, b, n=1) + + # Empty vs non-empty should give distance 1 + assert distance == 1 + + +@pytest.mark.unit +class TestLevenshteinDistance: + """Test Levenshtein edit distance metric.""" + + def test_identical_sequences(self): + """Distance should be 0 for identical sequences.""" + a = ["fam1", "fam2", "fam3"] + b = ["fam1", "fam2", "fam3"] + + distance = levenshtein(a, b) + + assert distance == 0 + + def test_single_substitution(self): + """Distance should be 1 for single substitution.""" + a = ["fam1", "fam2", "fam3"] + b = ["fam1", "fam4", "fam3"] # fam2 -> fam4 
+ + distance = levenshtein(a, b) + + assert distance == 1 + + def test_single_insertion(self): + """Distance should be 1 for single insertion.""" + a = ["fam1", "fam2"] + b = ["fam1", "fam2", "fam3"] # Inserted fam3 + + distance = levenshtein(a, b) + + assert distance == 1 + + def test_single_deletion(self): + """Distance should be 1 for single deletion.""" + a = ["fam1", "fam2", "fam3"] + b = ["fam1", "fam3"] # Deleted fam2 + + distance = levenshtein(a, b) + + assert distance == 1 + + def test_multiple_operations(self): + """Test distance with multiple edit operations.""" + a = ["fam1", "fam2", "fam3"] + b = ["fam4", "fam1", "fam5"] # sub + sub + sub = 3, or del + ins + ... = ? + + distance = levenshtein(a, b) + + # Should be 3 (all different positions) + assert distance == 3 + + def test_empty_to_nonempty(self): + """Distance from empty to non-empty is length of non-empty.""" + a = [] + b = ["fam1", "fam2", "fam3"] + + distance = levenshtein(a, b) + + assert distance == 3 + + def test_nonempty_to_empty(self): + """Distance from non-empty to empty is length of non-empty.""" + a = ["fam1", "fam2"] + b = [] + + distance = levenshtein(a, b) + + assert distance == 2 + + def test_both_empty(self): + """Distance between two empty sequences is 0.""" + a = [] + b = [] + + distance = levenshtein(a, b) + + assert distance == 0 + + def test_completely_different(self): + """Distance for completely different sequences.""" + a = ["fam1", "fam2", "fam3"] + b = ["fam4", "fam5", "fam6"] + + distance = levenshtein(a, b) + + # All substitutions = 3 + assert distance == 3 + + def test_longer_sequences(self): + """Test with longer sequences.""" + a = ["fam1", "fam2", "fam3", "fam4", "fam5"] + b = ["fam1", "fam2", "fam4", "fam5"] # Deleted fam3 + + distance = levenshtein(a, b) + + assert distance == 1 + + def test_optimized_for_length_swap(self): + """Test that algorithm swaps to optimize when b > a.""" + # Internally, algorithm swaps if lb > la for efficiency + a = ["fam1", "fam2"] + 
b = ["fam1", "fam2", "fam3", "fam4"] + + distance = levenshtein(a, b) + + # Should be 2 insertions + assert distance == 2 diff --git a/pairwise_macro_synteny_blocks/tests/test_request_handler.py b/pairwise_macro_synteny_blocks/tests/test_request_handler.py new file mode 100644 index 00000000..d5076fd2 --- /dev/null +++ b/pairwise_macro_synteny_blocks/tests/test_request_handler.py @@ -0,0 +1,360 @@ +import pytest + +from pairwise_macro_synteny_blocks.request_handler import RequestHandler + + +@pytest.mark.unit +class TestChromosomesToIndexPairs: + """Test the index pair generation from matching gene families.""" + + def setup_method(self): + self.handler = RequestHandler(redis_connection=None) + + def test_simple_match(self): + """Test basic matching between two chromosomes.""" + query = ["fam1", "fam2", "fam3", "fam4"] + target = ["fam4", "fam3", "fam2", "fam1"] + + pairs, masked = self.handler._chromosomesToIndexPairs( + query, target, mask=float("inf") + ) + + # Each target family matches one query family + assert len(pairs) == 4 + assert (0, 3) in pairs # target[0]=fam4 matches query[3]=fam4 + assert (1, 2) in pairs # target[1]=fam3 matches query[2]=fam3 + assert (2, 1) in pairs # target[2]=fam2 matches query[1]=fam2 + assert (3, 0) in pairs # target[3]=fam1 matches query[0]=fam1 + + def test_with_masking(self): + """Test that families exceeding mask threshold are excluded.""" + query = ["fam1", "fam1", "fam1", "fam2"] + target = ["fam1", "fam2"] + + pairs, masked = self.handler._chromosomesToIndexPairs(query, target, mask=2) + + # fam1 appears 3 times in query (exceeds mask=2), so should be masked + assert "fam1" in masked + # Only fam2 should match + assert len(pairs) == 1 + assert (1, 3) in pairs + + def test_duplicate_families_in_target(self): + """Test handling of duplicated families on target chromosome.""" + query = ["fam1", "fam2", "fam3"] + target = ["fam1", "fam1", "fam2", "fam3"] + + pairs, masked = self.handler._chromosomesToIndexPairs( + query, 
target, mask=float("inf") + ) + + # fam1 appears twice in target, should create 2 pairs + assert len(pairs) == 4 + assert (0, 0) in pairs # target[0]=fam1 matches query[0]=fam1 + assert (1, 0) in pairs # target[1]=fam1 matches query[0]=fam1 + assert (2, 1) in pairs # target[2]=fam2 matches query[1]=fam2 + assert (3, 2) in pairs # target[3]=fam3 matches query[2]=fam3 + + def test_target_masking(self): + """Test that target families exceeding mask are excluded.""" + query = ["fam1", "fam2"] + target = ["fam1", "fam1", "fam1", "fam2"] + + pairs, masked = self.handler._chromosomesToIndexPairs(query, target, mask=2) + + # fam1 appears 3 times in target (exceeds mask=2) + # So no fam1 pairs should be created + assert len(pairs) == 1 + assert (3, 1) in pairs # Only fam2 match + + +@pytest.mark.unit +class TestIndexPairsToIndexBlocks: + """Test the DAG-based longest path algorithm for synteny block detection.""" + + def setup_method(self): + self.handler = RequestHandler(redis_connection=None) + + def test_forward_orientation(self): + """Test detection of forward-oriented synteny block.""" + # Perfect forward diagonal: (0,0), (1,1), (2,2), (3,3) + pairs = [(0, 0), (1, 1), (2, 2), (3, 3)] + + blocks = list( + self.handler._indexPairsToIndexBlocks(pairs, intermediate=1, matched=3) + ) + + assert len(blocks) == 1 + begin, end, path = blocks[0] + assert (begin, end) == ((0, 0), (3, 3)) # begin, end + + def test_reverse_orientation(self): + """Test detection of reverse-oriented synteny block.""" + # Reverse diagonal: (0,3), (1,2), (2,1), (3,0) + pairs = [(0, 3), (1, 2), (2, 1), (3, 0)] + + blocks = list( + self.handler._indexPairsToIndexBlocks(pairs, intermediate=1, matched=3) + ) + + assert len(blocks) == 1 + begin, end, path = blocks[0] + assert (begin, end) == ((0, 3), (3, 0)) + + def test_intermediate_constraint(self): + """Test that intermediate distance constraint is enforced.""" + # Two separate blocks with large gap + pairs = [(0, 0), (1, 1), (10, 10), (11, 11)] + + 
blocks = list( + self.handler._indexPairsToIndexBlocks(pairs, intermediate=2, matched=2) + ) + + # Should form 2 separate blocks due to gap + assert len(blocks) == 2 + endpoints = [(b[0], b[1]) for b in blocks] + assert ((0, 0), (1, 1)) in endpoints + assert ((10, 10), (11, 11)) in endpoints + + def test_matched_filtering(self): + """Test that blocks shorter than matched parameter are filtered.""" + # Small blocks that don't meet matched requirement + pairs = [(0, 0), (1, 1), (10, 10)] + + blocks = list( + self.handler._indexPairsToIndexBlocks(pairs, intermediate=1, matched=3) + ) + + # No blocks meet the matched=3 requirement + assert len(blocks) == 0 + + def test_multiple_blocks(self): + """Test detection of multiple independent synteny blocks.""" + # Two forward blocks separated by a gap + pairs = [ + (0, 0), + (1, 1), + (2, 2), # Block 1 + (10, 10), + (11, 11), + (12, 12), + (13, 13), # Block 2 + ] + + blocks = list( + self.handler._indexPairsToIndexBlocks(pairs, intermediate=1, matched=3) + ) + + assert len(blocks) == 2 + endpoints = [(b[0], b[1]) for b in blocks] + assert ((0, 0), (2, 2)) in endpoints + assert ((10, 10), (13, 13)) in endpoints + + def test_mixed_orientations(self): + """Test that forward and reverse blocks are detected independently.""" + # One forward block and one reverse block + pairs = [ + (0, 0), + (1, 1), + (2, 2), # Forward + (10, 20), + (11, 19), + (12, 18), # Reverse + ] + + blocks = list( + self.handler._indexPairsToIndexBlocks(pairs, intermediate=1, matched=3) + ) + + assert len(blocks) == 2 + endpoints = [(b[0], b[1]) for b in blocks] + + # Forward block + assert ((0, 0), (2, 2)) in endpoints + # Reverse block + assert ((10, 20), (12, 18)) in endpoints + + def test_greedy_selection(self): + """Test that highest scoring blocks are selected first (greedy).""" + # Overlapping paths where greedy selection matters + # (0,0) -> (1,1) -> (2,2) -> (3,3) [score 4] + # (1,1) -> (2,2) [score 2] + pairs = [(0, 0), (1, 1), (2, 2), (3, 3)] + + 
blocks = list( + self.handler._indexPairsToIndexBlocks(pairs, intermediate=1, matched=2) + ) + + # Should select the longest block (0,0) -> (3,3) + # The shorter overlapping block (1,1) -> (2,2) can't be selected + # because nodes are already used + assert len(blocks) == 1 + begin, end, path = blocks[0] + assert (begin, end) == ((0, 0), (3, 3)) + + +@pytest.mark.unit +class TestIndexPathTraceback: + """Test the greedy traceback algorithm.""" + + def setup_method(self): + self.handler = RequestHandler(redis_connection=None) + + def test_basic_traceback(self): + """Test basic traceback with a simple path.""" + # Path: node1 -> node2 -> node3 + node1, node2, node3 = (0, 0), (1, 1), (2, 2) + path_ends = [(3, node3)] # Score 3, ends at node3 + pointers = {node3: node2, node2: node1} + scores = {node1: 1, node2: 2, node3: 3} + + blocks = list( + self.handler._indexBlocksViaIndexPathTraceback( + path_ends, pointers, scores, matched=3 + ) + ) + + assert len(blocks) == 1 + begin, end, path = blocks[0] + assert (begin, end) == (node1, node3) + # Nodes should be consumed (removed from pointers) + assert len(pointers) == 0 + + def test_multiple_paths_highest_first(self): + """Test that paths are processed in highest score first order.""" + node1, node2, node3, node4 = (0, 0), (1, 1), (5, 5), (6, 6) + + # Two independent paths with different scores + path_ends = [ + (4, node2), # Score 4 (higher) + (2, node4), # Score 2 (lower) + ] + pointers = { + node2: node1, + node4: node3, + } + scores = {node1: 1, node2: 4, node3: 1, node4: 2} + + blocks = list( + self.handler._indexBlocksViaIndexPathTraceback( + path_ends, pointers, scores, matched=2 + ) + ) + + # Both blocks meet matched requirement + assert len(blocks) == 2 + # Higher score block should be yielded first + begin1, end1, path1 = blocks[0] + begin2, end2, path2 = blocks[1] + assert (begin1, end1) == (node1, node2) + assert (begin2, end2) == (node3, node4) + + def test_overlapping_paths_greedy_consumption(self): + """Test 
that once a node is used, it's unavailable for other blocks.""" + shared_node = (1, 1) + node1, node3 = (0, 0), (2, 2) + + # Two paths sharing a node + path_ends = [ + (3, node3), # Score 3, path: node1 -> shared_node -> node3 + (2, shared_node), # Score 2, ends at shared_node + ] + pointers = { + node3: shared_node, + shared_node: node1, + } + scores = {node1: 1, shared_node: 2, node3: 3} + + blocks = list( + self.handler._indexBlocksViaIndexPathTraceback( + path_ends, pointers, scores, matched=2 + ) + ) + + # Only the first (higher score) block should be selected + # The second block can't be formed because shared_node is consumed + assert len(blocks) == 1 + begin, end, path = blocks[0] + assert (begin, end) == (node1, node3) + + +@pytest.mark.unit +@pytest.mark.asyncio +class TestProcessIntegration: + """Integration tests for the full process pipeline.""" + + async def test_process_chromosome_not_found(self, redis_with_chromosome): + """Test handling of non-existent target chromosome.""" + handler = RequestHandler(redis_with_chromosome) + + result = await handler.process( + query_chromosome=["fam1", "fam2"], + target="nonexistent_chr", + matched=2, + intermediate=5, + mask=None, + metrics=[], + chromosome_genes=2, + chromosome_length=1, + ) + + # Should return None when chromosome not found + assert result is None + + async def test_process_chromosome_too_short(self, redis_with_chromosome): + """Test filtering of chromosomes below length threshold.""" + handler = RequestHandler(redis_with_chromosome) + + result = await handler.process( + query_chromosome=["fam1", "fam2"], + target="test_chr", + matched=2, + intermediate=5, + mask=None, + metrics=[], + chromosome_genes=2, + chromosome_length=20000, # Larger than test_chr length (10000) + ) + + # Should return empty list for too-short chromosome + assert result == [] + + async def test_process_insufficient_genes(self, redis_with_chromosome): + """Test filtering when target has too few genes for a block.""" + 
handler = RequestHandler(redis_with_chromosome) + + result = await handler.process( + query_chromosome=["fam1", "fam2"], + target="test_chr", + matched=100, # More genes than exist + intermediate=5, + mask=None, + metrics=[], + chromosome_genes=100, + chromosome_length=1, + ) + + # Should return empty list when insufficient genes + assert result == [] + + async def test_process_with_masking(self, redis_with_chromosome): + """Test that masking parameter filters repetitive families.""" + handler = RequestHandler(redis_with_chromosome) + + # Create query with repeated families + query_chromosome = ["fam1", "fam1", "fam1", "fam2", "fam3"] + + blocks = await handler.process( + query_chromosome=query_chromosome, + target="test_chr", + matched=2, + intermediate=5, + mask=2, # fam1 should be masked + metrics=[], + chromosome_genes=2, + chromosome_length=1, + ) + + # Blocks should not include masked families + # (This is implicit in the algorithm, hard to assert directly) + assert isinstance(blocks, list) diff --git a/proto/block/v1/block.proto b/proto/block/v1/block.proto index 621bb867..36ee98ee 100644 --- a/proto/block/v1/block.proto +++ b/proto/block/v1/block.proto @@ -17,4 +17,6 @@ message Block { uint32 fmax = 4; string orientation = 5; repeated float optionalMetrics = 6; + // Fields 7-9 reserved for queryGeneName, queryGeneFmin, queryGeneFmax (used by macro_synteny_blocks) + optional float identity = 10; // Identity score (0.0-1.0) if requested } diff --git a/redis_loader/redis_loader/loaders/gff.py b/redis_loader/redis_loader/loaders/gff.py index 39474833..36f21e83 100644 --- a/redis_loader/redis_loader/loaders/gff.py +++ b/redis_loader/redis_loader/loaders/gff.py @@ -2,11 +2,41 @@ import codecs import csv import gzip +import tempfile from collections import defaultdict from urllib.request import urlopen, urlparse # dependencies -import gffutils +import pyranges1 as pr + + +def _open_gff_source(source): + """ + Opens a GFF source (local path or URL, optionally 
gzipped) and returns + a path that PyRanges can read. + + PyRanges read_gff3 expects a local file path, so for URLs we download + to a temporary file first. + + Parameters: + source (str): Local path or URL to a GFF file (may be gzipped). + + Returns: + str: Path to a readable GFF file. + """ + parsed = urlparse(source) + is_remote = parsed.scheme in ("http", "https", "ftp") + + if not is_remote: + # Local file - pyranges handles gzip automatically based on extension + return source + + # Remote URL - download to temp file + suffix = ".gff3.gz" if source.endswith(".gz") else ".gff3" + with tempfile.NamedTemporaryFile(mode="wb", suffix=suffix, delete=False) as tmp: + with urlopen(source) as response: + tmp.write(response.read()) + return tmp.name def transferChromosomes(redisearch_loader, genus, species, chromosome_gff): @@ -14,32 +44,28 @@ def transferChromosomes(redisearch_loader, genus, species, chromosome_gff): Loads chromosomes from a GFF file into a RediSearch database. Parameters: - redisearch_loader (RediSearchLoader): The loader to use to load data into - RediSearch. - genus (str): The genus of the chromosomes being loaded. - species (str): The species of the chromosomes being loaded. - chromosome_gff (str): The local path or URL to the GFF to load chromosomes from. + redisearch_loader (RediSearchLoader): The loader to use to load data into + RediSearch. + genus (str): The genus of the chromosomes being loaded. + species (str): The species of the chromosomes being loaded. + chromosome_gff (str): The local path or URL to the GFF to load chromosomes from. Returns: - set[str]: A set containing the names of all the chromosomes that were - loaded. + set[str]: A set containing the names of all the chromosomes that were loaded. 
""" + gff_path = _open_gff_source(chromosome_gff) + gff = pr.read_gff3(gff_path) - # create chromosome SQLLite database from chromosomal GFF file - gffchr_db = gffutils.create_db( - chromosome_gff, - ":memory:", - force=True, - keep_order=True, - ) + # Filter to chromosome and supercontig features only + chromosomes = gff[gff.Feature.isin(["chromosome", "supercontig"])] # index the chromosomes chromosome_names = set() - for chr in gffchr_db.features_of_type( - ("chromosome", "supercontig"), order_by="attributes" - ): - name = chr.seqid - length = chr.end + for row in chromosomes.itertuples(): + name = row.Chromosome + # End is already converted to 0-based exclusive by pyranges, + # which equals the length for a feature starting at position 1 + length = row.End chromosome_names.add(name) redisearch_loader.indexChromosome(name, length, genus, species) @@ -51,74 +77,91 @@ def transferGenes(redisearch_loader, gene_gff, gfa, chromosome_names): Loads genes from a GFF file into a RediSearch database. Parameters: - redisearch_loader (RediSearchLoader): The loader to use to load data into - RediSearch. - gene_gff (str): The local path or URL to the GFF to load genes from. - gfa (str): The local path or URL to a GFA file containing gene family - associations for the genes being loaded. - chromosome_names (set[str]): A containing the names of all the chromosomes - that have been loaded. + redisearch_loader (RediSearchLoader): The loader to use to load data into + RediSearch. + gene_gff (str): The local path or URL to the GFF to load genes from. + gfa (str): The local path or URL to a GFA file containing gene family + associations for the genes being loaded. + chromosome_names (set[str]): A set containing the names of all the chromosomes + that have been loaded. 
""" + gff_path = _open_gff_source(gene_gff) + gff = pr.read_gff3(gff_path) - # create gene SQLLite database from gene GFF file - gffgene_db = gffutils.create_db(gene_gff, ":memory:", force=True, keep_order=True) + # Filter to gene features only + genes_df = gff[gff.Feature == "gene"] - # index all the genes in the db - gene_lookup = dict() + # Build gene lookup and chromosome groupings + strand_map = {"+": 1, "-": -1} + gene_lookup = {} chromosome_genes = defaultdict(list) - for gffgene in gffgene_db.features_of_type("gene", order_by="attributes"): - chr_name = gffgene.seqid - if chr_name in chromosome_names: - strand = 0 - if gffgene.strand == "+": - strand = 1 - if gffgene.strand == "-": - strand = -1 - gene = { - "name": gffgene.id, - "fmin": gffgene.start, - "fmax": gffgene.end, - "strand": strand, - "family": "", - } - gene_lookup[gffgene.id] = gene - chromosome_genes[chr_name].append(gene) - # deal with family assignments (for non-orphans) from GFA - with open(gfa, "rb") if urlparse(gfa).scheme == "" else urlopen(gfa) as fileobj: - tsv = gzip.GzipFile(fileobj=fileobj) if gfa.endswith("gz") else fileobj + + for row in genes_df.itertuples(): + chr_name = row.Chromosome + if chr_name not in chromosome_names: + continue + + gene_id = row.ID if hasattr(row, "ID") else row.Index + # PyRanges uses 0-based half-open coordinates [start, end) + # GFF3/gffutils uses 1-based closed coordinates [start, end] + # Convert back to 1-based to match original gffutils behavior + gene = { + "name": gene_id, + "fmin": row.Start + 1, # Convert 0-based to 1-based + "fmax": row.End, # End is same in both systems + "strand": strand_map.get(row.Strand, 0), + "family": "", + } + gene_lookup[gene_id] = gene + chromosome_genes[chr_name].append(gene) + + # Load family assignments from GFA file + _load_gene_families(gfa, gene_lookup) + + # Index the genes + for chr_name, genes in chromosome_genes.items(): + redisearch_loader.indexChromosomeGenes(chr_name, genes) + + +def 
_load_gene_families(gfa, gene_lookup): + """ + Loads gene family assignments from a GFA file into the gene lookup dict. + + Parameters: + gfa (str): Local path or URL to a GFA file (may be gzipped). + gene_lookup (dict): Dictionary mapping gene IDs to gene dicts. + """ + parsed = urlparse(gfa) + is_remote = parsed.scheme in ("http", "https", "ftp") + + with open(gfa, "rb") if not is_remote else urlopen(gfa) as fileobj: + tsv = gzip.GzipFile(fileobj=fileobj) if gfa.endswith(".gz") else fileobj for line in csv.reader(codecs.iterdecode(tsv, "utf-8"), delimiter="\t"): - # skip comment and metadata lines - if line[0].startswith("#") or line[0] == "ScoreMeaning": + # Skip comment and metadata lines + if not line or line[0].startswith("#") or line[0] == "ScoreMeaning": continue gene_id = line[0] if gene_id in gene_lookup: - gene = gene_lookup[gene_id] - genefamily_id = line[1] - gene["family"] = genefamily_id - - # index the genes - for chr_name, genes in chromosome_genes.items(): - redisearch_loader.indexChromosomeGenes(chr_name, genes) + gene_lookup[gene_id]["family"] = line[1] def loadFromGFF( redisearch_loader, genus, species, strain, chromosome_gff, gene_gff, gfa ): """ - Loads data from a GFF files into a RediSearch database. + Loads data from GFF files into a RediSearch database. Parameters: - redisearch_loader (RediSearchLoader): The loader to use to load data into - RediSearch. - genus (str): The genus of the data being loaded. - species (str): The species of the data being loaded. - strain (str): The strain of the data being loaded. - chromosome_gff (pathlib.Path): The path to the GFF to load chromosomes from. - gene_gff (pathlib.Path): The path to the GFF to load genes from. - gfa (pathlib.Path): The path to a GFA file containing gene family - associations for the genes being loaded. + redisearch_loader (RediSearchLoader): The loader to use to load data into + RediSearch. + genus (str): The genus of the data being loaded. 
+ species (str): The species of the data being loaded. + strain (str): The strain of the data being loaded. + chromosome_gff (str): The path or URL to the GFF to load chromosomes from. + gene_gff (str): The path or URL to the GFF to load genes from. + gfa (str): The path or URL to a GFA file containing gene family + associations for the genes being loaded. """ - # HACK the species to contain the strain name if given if strain is not None: species += ":" + strain diff --git a/redis_loader/setup.cfg b/redis_loader/setup.cfg index 73d2e373..0ac86f24 100644 --- a/redis_loader/setup.cfg +++ b/redis_loader/setup.cfg @@ -17,10 +17,8 @@ classifiers = Operating System :: OS Independent Programming Language :: Python :: 3 Programming Language :: Python :: 3 :: Only - Programming Language :: Python :: 3.7 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.12 + Programming Language :: Python :: 3.13 project_urls = Bug Reports = https://github.com/legumeinfo/microservices/issues Source = https://github.com/legumeinfo/microservices @@ -28,9 +26,9 @@ project_urls = [options] packages = find: -python_requires = >=3.7, <4 +python_requires = >=3.12, <4 install_requires = - gffutils + pyranges1 psycopg2 redis diff --git a/search/requirements.txt b/search/requirements.txt index 68f779f9..4b6feef7 100644 --- a/search/requirements.txt +++ b/search/requirements.txt @@ -6,7 +6,7 @@ # aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.12.15 +aiohttp==3.13.0 # via # aiohttp-cors # search (setup.py) @@ -14,35 +14,37 @@ aiohttp-cors==0.8.1 # via search (setup.py) aiosignal==1.4.0 # via aiohttp -attrs==25.3.0 +attrs==25.4.0 # via aiohttp -frozenlist==1.7.0 +frozenlist==1.8.0 # via # aiohttp # aiosignal -grpcio==1.74.0 +grpcio>=1.78.0 # via # grpcio-tools # search (setup.py) -grpcio-tools==1.74.0 +grpcio-tools>=1.78.0 # via search (setup.py) -idna==3.10 +idna==3.11 # via yarl 
-multidict==6.6.4 +multidict==6.7.0 # via # aiohttp # yarl -propcache==0.3.2 +propcache==0.4.1 # via # aiohttp # yarl -protobuf==6.32.0 +protobuf==6.32.1 # via grpcio-tools -pyparsing==3.2.3 +pyparsing==3.2.5 # via search (setup.py) +typing-extensions==4.15.0 + # via grpcio uvloop==0.21.0 # via search (setup.py) -yarl==1.20.1 +yarl==1.22.0 # via aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/search/search/commands.py b/search/search/commands.py index 7a590ea7..6ed18850 100644 --- a/search/search/commands.py +++ b/search/search/commands.py @@ -22,8 +22,8 @@ import os import sys +from importlib.resources import files as resource_files -import pkg_resources import setuptools @@ -61,9 +61,7 @@ def build_package_protos(self): if filename.endswith(".proto"): proto_files.append(os.path.abspath(os.path.join(root, filename))) - well_known_protos_include = pkg_resources.resource_filename( - "grpc_tools", "_proto" - ) + well_known_protos_include = str(resource_files("grpc_tools").joinpath("_proto")) for proto_file in proto_files: command = [