Skip to content

Commit 497a51a

Browse files
authored
Merge pull request #16 from ESA-PhiLab/copilot/fix-a5c4f09d-5aed-4188-8b33-d9ae22bd2177
2 parents 251da60 + c919855 commit 497a51a

3 files changed

Lines changed: 263 additions & 8 deletions

File tree

notebooks/1_search_n_download.ipynb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
"name": "stdout",
3939
"output_type": "stream",
4040
"text": [
41-
"Number of results: 910\n"
41+
"Number of results: 5337\n"
4242
]
4343
},
4444
{
@@ -289,10 +289,10 @@
289289
" orbit_direction=None,\n",
290290
" cloud_cover_threshold=None,\n",
291291
" aoi_wkt=aoi_wkt, # Example: aoi_wkt=aoi_wkt if you want to use the defined AOI\n",
292-
" start_date = '2023-05-03T00:00:00',\n",
292+
" start_date = '2020-05-03T00:00:00',\n",
293293
" end_date = '2024-05-03T04:00:00',\n",
294294
" top=1000,\n",
295-
" count=True, # Set to True to get the total count of results\n",
295+
" count=True, # Set to True to get all the results, supersedes the top arg.\n",
296296
" attributes={'processingLevel':'LEVEL0',\n",
297297
" 'operationalMode': 'SM',\n",
298298
" # 'swathIdentifier': 'S1', # Swath identifier is: 1,2,3,4,5,6 for RAW\n",
@@ -353,7 +353,7 @@
353353
],
354354
"metadata": {
355355
"kernelspec": {
356-
"display_name": ".venv",
356+
"display_name": "phidown-3.9",
357357
"language": "python",
358358
"name": "python3"
359359
},
@@ -367,7 +367,7 @@
367367
"name": "python",
368368
"nbconvert_exporter": "python",
369369
"pygments_lexer": "ipython3",
370-
"version": "3.9.6"
370+
"version": "3.9.18"
371371
}
372372
},
373373
"nbformat": 4,

phidown/search.py

Lines changed: 79 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import typing
66
from datetime import datetime
77
import copy
8+
import asyncio
89

910
from .downloader import pull_down
1011

@@ -111,7 +112,6 @@ def query_by_filter(
111112
start_date (str, optional): Start date for filtering (ISO 8601 format). Defaults to None.
112113
end_date (str, optional): End date for filtering (ISO 8601 format). Defaults to None.
113114
top (int, optional): Maximum number of results to retrieve. Defaults to 1000.
114-
count (bool, optional): Whether to include count of results. Defaults to False.
115115
order_by (str, optional): Field and direction to order results by. Defaults to "ContentDate/Start desc".
116116
burst_mode (bool, optional): Enable Sentinel-1 SLC Burst mode searching. Defaults to False.
117117
burst_id (int, optional): Burst ID to filter (burst mode only). Defaults to None.
@@ -161,6 +161,8 @@ def query_by_filter(
161161
self._validate_time() # Validate start and end dates
162162

163163
self.top = top
164+
if self.count:
165+
self.top = 1000
164166
self._validate_top()
165167

166168
self.order_by = order_by
@@ -680,15 +682,89 @@ def _build_query(self):
680682
return self.url
681683

682684
def execute_query(self):
    """Execute the built query and retrieve matching products.

    Performs a single GET against the OData endpoint. If count=True and
    the total number of results ('@odata.count') exceeds the 'top' limit,
    this method automatically paginates through all results using
    multiple requests with the $skip parameter, combining all results
    into a single DataFrame.

    Returns:
        pd.DataFrame: DataFrame containing all retrieved products.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    url = self._build_query()
    # requests.get() already returns a fresh Response object; the previous
    # copy.deepcopy() of it was redundant, wasted memory, and can fail on
    # responses that still reference live network resources.
    self.response = requests.get(url)
    self.response.raise_for_status()  # Raise an error for bad status codes

    self.json_data = self.response.json()
    # '@odata.count' is only present when the query requested a count.
    self.num_results = self.json_data.get('@odata.count', 0)

    # Paginate only when counting is enabled and the total exceeds one page.
    if self.count and self.num_results > self.top:
        return self._execute_paginated_query()
    else:
        self.df = pd.DataFrame.from_dict(self.json_data['value'])
        return self.df
708+
709+
def _execute_paginated_query(self):
    """Fetch every remaining result page concurrently and combine them.

    The first page is already held in ``self.json_data`` (retrieved by
    ``execute_query``); this method computes the $skip offsets for the
    remaining pages, downloads them concurrently, and builds ``self.df``
    from the combined records. Pages that fail to download are skipped
    with a printed warning so partial results are still returned.

    Returns:
        pd.DataFrame: DataFrame containing all collected products.
    """
    all_data = []

    # Seed with the first page retrieved in execute_query().
    if 'value' in self.json_data:
        all_data.extend(self.json_data['value'])

    page_size = self.top  # Use the current top value as page size

    # Offsets of the remaining pages (the first page is offset 0).
    skips = range(page_size, self.num_results, page_size)

    if not skips:
        self.df = pd.DataFrame.from_dict(all_data)
        return self.df

    urls = []
    for skip in skips:
        paginated_query = f"?$filter={self.filter_condition}&$orderby={self.order_by}&$top={page_size}&$skip={skip}&$expand=Attributes"
        if self.count:
            paginated_query += "&$count=true"
        urls.append(f"{self.base_url}{paginated_query}")

    def fetch_page(url):
        # One blocking GET per worker thread; return the exception instead
        # of raising so one failed page cannot abort the whole batch.
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            return e

    # The previous implementation wrapped these blocking requests in
    # asyncio via run_in_executor and needed a special code path when an
    # event loop was already running (e.g. Jupyter). A thread pool gives
    # the same concurrency directly, preserves result order, and works
    # in any calling context.
    import concurrent.futures
    with concurrent.futures.ThreadPoolExecutor() as pool:
        results = list(pool.map(fetch_page, urls))

    # Collect successful pages in request order; warn about failures.
    for res in results:
        if isinstance(res, Exception):
            print(f"Warning: Error retrieving page: {res}")
        elif isinstance(res, dict) and 'value' in res:
            all_data.extend(res['value'])

    # Create DataFrame from all collected data
    self.df = pd.DataFrame.from_dict(all_data)
    return self.df
693769

694770
def query_by_name(self, product_name: str) -> pd.DataFrame:

tests/test_pagination.py

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
import pytest
2+
import sys, os
3+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4+
5+
from phidown.search import CopernicusDataSearcher
6+
from unittest.mock import Mock, patch
7+
import pandas as pd
8+
9+
# Define the path to the config file relative to the test file
10+
CONFIG_PATH = os.path.join(os.path.dirname(__file__), '..', 'phidown', 'config.json')
11+
12+
13+
def test_pagination_disabled_by_default():
    """Pagination must not be triggered when count=False."""
    searcher = CopernicusDataSearcher()
    searcher.query_by_filter(
        collection_name='SENTINEL-1',
        product_type='SLC',
        top=10,
        count=False  # Pagination should not trigger
    )

    # One mocked page that advertises a large total count.
    page_payload = {
        'value': [{'Id': f'product_{i}', 'Name': f'name_{i}'} for i in range(10)],
        '@odata.count': 1500  # More than top=10, but count=False
    }
    mock_response = Mock()
    mock_response.json.return_value = page_payload
    mock_response.raise_for_status = Mock()

    with patch('requests.get', return_value=mock_response) as mock_get:
        df = searcher.execute_query()

    # count=False -> exactly one request, only the first page returned.
    assert mock_get.call_count == 1
    assert len(df) == 10
37+
38+
39+
def test_pagination_when_count_enabled_and_results_exceed_top():
    """Test pagination is triggered when count=True and results > top"""
    # NOTE(review): the accompanying search.py change appears to force
    # self.top to 1000 whenever count=True (`if self.count: self.top = 1000`
    # in query_by_filter). If so, top=5 is overridden, num_results=12 would
    # NOT exceed top, pagination would not trigger, and this test would fail
    # (call_count would be 1, not 3) — confirm against the full
    # query_by_filter implementation.
    searcher = CopernicusDataSearcher()
    searcher.query_by_filter(
        collection_name='SENTINEL-1',
        product_type='SLC',
        top=5,
        count=True
    )

    # Mock responses for pagination.
    # First page: records 0-4 plus the advertised total count (12).
    mock_response_1 = Mock()
    mock_response_1.json.return_value = {
        'value': [{'Id': f'product_{i}', 'Name': f'name_{i}'} for i in range(5)],
        '@odata.count': 12
    }
    mock_response_1.raise_for_status = Mock()

    # Second page: records 5-9 (follow-up pages carry no count field).
    mock_response_2 = Mock()
    mock_response_2.json.return_value = {
        'value': [{'Id': f'product_{i}', 'Name': f'name_{i}'} for i in range(5, 10)]
    }
    mock_response_2.raise_for_status = Mock()

    # Third (final, partial) page: records 10-11.
    mock_response_3 = Mock()
    mock_response_3.json.return_value = {
        'value': [{'Id': f'product_{i}', 'Name': f'name_{i}'} for i in range(10, 12)]
    }
    mock_response_3.raise_for_status = Mock()

    with patch('requests.get', side_effect=[mock_response_1, mock_response_2, mock_response_3]) as mock_get:
        df = searcher.execute_query()

    # Should make 3 requests total
    assert mock_get.call_count == 3
    assert len(df) == 12

    # Check that skip parameters were used correctly
    calls = mock_get.call_args_list
    assert '$skip=5' in calls[1][0][0]
    assert '$skip=10' in calls[2][0][0]
80+
81+
82+
def test_no_pagination_when_results_within_top_limit():
    """No pagination when count=True but results <= top."""
    searcher = CopernicusDataSearcher()
    searcher.query_by_filter(
        collection_name='SENTINEL-1',
        product_type='SLC',
        top=100,
        count=True
    )

    # 50 records with a count below the top limit of 100.
    records = [{'Id': f'product_{i}', 'Name': f'name_{i}'} for i in range(50)]
    mock_response = Mock()
    mock_response.raise_for_status = Mock()
    mock_response.json.return_value = {'value': records, '@odata.count': 50}

    with patch('requests.get', return_value=mock_response) as mock_get:
        df = searcher.execute_query()

    # Everything fits in one page, so a single request suffices.
    assert mock_get.call_count == 1
    assert len(df) == 50
106+
107+
108+
def test_pagination_with_1000_page_size():
    """Pagination with the default page size of 1000."""
    searcher = CopernicusDataSearcher()
    searcher.query_by_filter(
        collection_name='SENTINEL-1',
        product_type='SLC',
        top=1000,  # Default page size
        count=True
    )

    def make_page(start, stop, total=None):
        # Build a mocked response covering records [start, stop).
        payload = {
            'value': [{'Id': f'product_{i}', 'Name': f'name_{i}'} for i in range(start, stop)]
        }
        if total is not None:
            payload['@odata.count'] = total
        response = Mock()
        response.json.return_value = payload
        response.raise_for_status = Mock()
        return response

    # 2500 results split across three pages of at most 1000 each.
    pages = [
        make_page(0, 1000, total=2500),
        make_page(1000, 2000),
        make_page(2000, 2500),
    ]

    with patch('requests.get', side_effect=pages) as mock_get:
        df = searcher.execute_query()

    # Three requests are needed to cover 2500 results at page size 1000.
    assert mock_get.call_count == 3
    assert len(df) == 2500

    # Follow-up requests must carry the correct $skip offsets.
    calls = mock_get.call_args_list
    assert '$skip=1000' in calls[1][0][0]
    assert '$skip=2000' in calls[2][0][0]
149+
150+
151+
def test_pagination_handles_request_errors_gracefully():
    """Test that pagination handles request errors gracefully"""
    # NOTE(review): as with the other count=True tests, query_by_filter seems
    # to override top to 1000 when count=True; with num_results=15 < 1000,
    # pagination would never trigger and this test would pass for the wrong
    # reason (single page of 5, no error path exercised). Also, if pagination
    # DOES trigger, three requests are made but only two mock responses are
    # supplied — the third call exhausts side_effect and relies on the
    # paginator swallowing that error too. Verify the intended interaction.
    searcher = CopernicusDataSearcher()
    searcher.query_by_filter(
        collection_name='SENTINEL-1',
        product_type='SLC',
        top=5,
        count=True
    )

    # Mock first response successful (page 0-4, total count 15).
    mock_response_1 = Mock()
    mock_response_1.json.return_value = {
        'value': [{'Id': f'product_{i}', 'Name': f'name_{i}'} for i in range(5)],
        '@odata.count': 15
    }
    mock_response_1.raise_for_status = Mock()

    # Mock second response fails on the status check.
    mock_response_2 = Mock()
    mock_response_2.raise_for_status.side_effect = Exception("Network error")

    with patch('requests.get', side_effect=[mock_response_1, mock_response_2]):
        # Should not raise exception, but return partial results
        df = searcher.execute_query()

    # Should return at least the first page
    assert len(df) == 5
    assert 'product_0' in df['Id'].values

0 commit comments

Comments
 (0)