Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions helpers/hasyncio.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ class _EventLoop(async_solipsism.EventLoop):
# the replayed time approach and can remove `ReplayedTime` object.
def __init__(self) -> None:
super().__init__()
self._initial_dt = datetime.datetime.utcnow()
self._initial_dt = datetime.datetime.now(datetime.timezone.utc)

def get_current_time(self) -> datetime.datetime:
# `loop.time()` returns the number of seconds as `float` from when the event
Expand Down Expand Up @@ -100,7 +100,7 @@ def solipsism_context() -> Iterator:


async def gather_coroutines_with_wall_clock(
event_loop: asyncio.AbstractEventLoop, *coroutines: List[Coroutine]
event_loop: asyncio.AbstractEventLoop, *coroutines: Callable[..., Coroutine]
) -> List[Any]:
"""
Inject a wall clock associated to `event_loop` in all the coroutines and
Expand All @@ -111,9 +111,9 @@ async def gather_coroutines_with_wall_clock(
)
# Construct the coroutines here by passing the `get_wall_clock_time()`
# function.
coroutines = [coro(get_wall_clock_time) for coro in coroutines]
coroutine_instances = [coro(get_wall_clock_time) for coro in coroutines]
#
result: List[Any] = await asyncio.gather(*coroutines)
result: List[Any] = await asyncio.gather(*coroutine_instances)
return result


Expand Down
5 changes: 4 additions & 1 deletion helpers/hcache.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,10 @@ def _get_cache_size(path: str, description: str) -> str:
else:
if os.path.exists(path):
size_in_bytes = hsystem.du(path)
size_as_str = hintros.format_size(size_in_bytes)
if isinstance(size_in_bytes, str):
size_as_str = size_in_bytes
else:
size_as_str = hintros.format_size(size_in_bytes)
else:
size_as_str = "nan"
# TODO(gp): Compute number of files.
Expand Down
7 changes: 5 additions & 2 deletions helpers/hcache_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
import os
import pickle
import re
from typing import Any, Callable, Dict, List, Union, cast
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Union, cast

if TYPE_CHECKING:
import pandas as pd

import helpers.hdbg as hdbg
import helpers.hprint as hprint
Expand Down Expand Up @@ -406,7 +409,7 @@ def get_cache(func_name: str) -> _CacheType:
# #############################################################################


def cache_stats_to_str(func_name: str = "") -> "pd.DataFrame": # noqa: F821
def cache_stats_to_str(func_name: str = "") -> "pd.DataFrame":
"""
Print the cache stats for a function or for all functions.

Expand Down
12 changes: 8 additions & 4 deletions helpers/hchatgpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,17 @@
import os
import sys
import time
from typing import Dict, List, Optional
from typing import TYPE_CHECKING, Dict, List, Optional

import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hio as hio

henv.install_module_if_not_present("openai")
import openai # noqa: E402
if TYPE_CHECKING:
import openai # type: ignore
else:
henv.install_module_if_not_present("openai")
import openai # noqa: E402

_LOG = logging.getLogger(__name__)

Expand All @@ -41,7 +44,7 @@ def create_assistant(
model: str = "gpt-3.5-turbo-1106",
use_retrieval: bool = True,
use_code_interpreter: bool = True,
use_function: Dict = None,
use_function: Optional[Dict] = None,
) -> str:
"""
Create an OpenAI Assistant for your OpenAI Organization. All configs can
Expand Down Expand Up @@ -175,6 +178,7 @@ def _path_to_dict(path: str) -> Dict:
tree = {d: _path_to_dict(os.path.join(root, d)) for d in dirs}
tree.update({f: {"name": f} for f in files})
return tree
return {}


# TODO(Henry): We use fileIO here to store the directory structure, which may
Expand Down
6 changes: 3 additions & 3 deletions helpers/hcsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import ast
import logging
import os
from typing import Any, Callable, Dict, List, Optional
from typing import Any, Callable, Dict, List, Optional, cast

import pandas as pd

Expand Down Expand Up @@ -45,7 +45,7 @@ def _read_csv_range(
"""
hdbg.dassert_lt(0, from_, msg="Row 0 assumed to be header row")
hdbg.dassert_lt(from_, to, msg="Empty range requested!")
skiprows = range(1, from_)
skiprows = list(range(1, from_))
nrows = to - from_
df = pd.read_csv(csv_path, skiprows=skiprows, nrows=nrows, **kwargs)
if df.shape[0] < to:
Expand Down Expand Up @@ -236,7 +236,7 @@ def convert_csv_dir_to_pq_dir(
if hs3.is_s3_path(csv_dir):
# TODO(gp): Pass aws_profile.
s3fs = hs3.get_s3fs("am")
filenames = s3fs.ls(csv_dir)
filenames = cast(Any, s3fs).ls(csv_dir)
else:
# Local filesystem.
hdbg.dassert_dir_exists(csv_dir)
Expand Down
7 changes: 4 additions & 3 deletions helpers/hdataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,9 +236,10 @@ def infer_sampling_points_per_year(df: Union[pd.Series, pd.DataFrame]) -> float:
:return: number of time points per year (approximate)
"""
hdbg.dassert(hasattr(df.index, "freq") and df.index.freq is not None)
assert hasattr(df.index, "freq") and df.index.freq is not None
freq = df.index.freq
# TODO(*): Make start, end dates parameters that can be passed in.
return compute_points_per_year_for_given_freq(freq)
return compute_points_per_year_for_given_freq(str(cast(Any, freq)))


@functools.lru_cache()
Expand Down Expand Up @@ -274,7 +275,7 @@ def compute_count_per_year(df: Union[pd.Series, pd.DataFrame]) -> float:
assert hasattr(df.index, "freq") and df.index.freq is not None
freq = df.index.freq
# Calculate the time span of `df` in years.
points_per_year = compute_points_per_year_for_given_freq(freq)
points_per_year = compute_points_per_year_for_given_freq(str(freq))
span_in_years = df.size / points_per_year
# Determine the number of non-NaN/inf/etc. data points per year.
count_per_year = df.count() / span_in_years
Expand Down Expand Up @@ -302,7 +303,7 @@ def remove_duplicates(
# Fix maximum value of control column at the bottom.
if control_column:
df = df.sort_values(by=control_column)
duplicate_columns = duplicate_columns or df.columns
duplicate_columns = duplicate_columns or list(df.columns)
df = df.drop_duplicates(subset=duplicate_columns)
# Sort by index to return to original view.
df = df.sort_index()
Expand Down
23 changes: 12 additions & 11 deletions helpers/hdatetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import datetime
import logging
import re
from typing import Callable, Iterable, Optional, Tuple, Union
from typing import Any, Callable, Iterable, Optional, Tuple, Union, cast

# TODO(gp): Use hdbg.WARNING
_WARNING = "\033[33mWARNING\033[0m"
Expand Down Expand Up @@ -371,11 +371,12 @@ def get_current_time(
# We accept only `hasyncio.EventLoop` here. If we are using standard asyncio
# EventLoop we rely on wall-clock time instead of `loop.time()`.
hdbg.dassert_isinstance(event_loop, asyncio.AbstractEventLoop)
assert isinstance(event_loop, asyncio.AbstractEventLoop)
hdbg.dassert(hasattr(event_loop, "get_current_time"))
timestamp = event_loop.get_current_time()
timestamp = cast(Any, event_loop).get_current_time()
else:
# Use true real-time.
timestamp = datetime.datetime.utcnow()
timestamp = datetime.datetime.now(datetime.timezone.utc)
# Convert it into a `pd.Timestamp` in the UTC timezone.
timestamp = pd.Timestamp(timestamp, tz=get_UTC_tz())
if tz == "UTC":
Expand Down Expand Up @@ -616,9 +617,9 @@ def to_generalized_datetime(
# Handle both scalar and array cases for `pd.isna()`.
if hasattr(datetime_dates, "all"):
# datetime_dates is a Series or array-like
all_na = pd.isna(datetime_dates).all()
all_na = cast(Any, pd.isna(datetime_dates)).all()
datetime_example = (
datetime_dates.tolist()[format_example_index]
cast(Any, datetime_dates).tolist()[format_example_index]
if hasattr(datetime_dates, "tolist")
else datetime_dates
)
Expand All @@ -632,25 +633,25 @@ def to_generalized_datetime(
and hasattr(datetime_example, "strftime")
and datetime_example.strftime("%Y-%m-%d") == date_example
):
return datetime_dates
return cast(Union[pd.Series, pd.Index], datetime_dates)
shift_func = _shift_to_period_end(date_example)
if shift_func is not None:
if hasattr(datetime_dates, "map"):
datetime_dates = datetime_dates.map(shift_func)
datetime_dates = cast(Any, datetime_dates).map(shift_func)
else:
# For scalar case, apply the shift function directly
datetime_dates = shift_func(datetime_dates)
return datetime_dates
return cast(Union[pd.Series, pd.Index], datetime_dates)
# If standard conversion fails, attempt our own conversion.
date_standard = date_standard or "standard"
format_determination_output = _determine_date_format(
date_example, date_standard
)
if format_determination_output is None:
return datetime_dates
return cast(Union[pd.Series, pd.Index], datetime_dates)
format_, date_modification_func = format_determination_output
dates = dates.map(date_modification_func)
return pd.to_datetime(dates, format=format_)
return cast(Union[pd.Series, pd.Index], pd.to_datetime(dates, format=format_))


def _handle_incorrect_conversions(
Expand Down Expand Up @@ -722,7 +723,7 @@ def shift_to_year_end(x: StrictDatetime) -> StrictDatetime:
# shift the month aliases by one to get the correct order.
# E.g., `calendar.month_name[1:]` is `['January', 'February', ...]` and
# `calendar.month_abbr[1:]` is `['Jan', 'Feb', ...]`.
month_aliases = calendar.month_name[1:] + calendar.month_abbr[1:]
month_aliases = list(calendar.month_name[1:]) + list(calendar.month_abbr[1:])
pattern = re.compile("|".join(month_aliases), re.IGNORECASE)
match = pattern.search(date)
if match is None:
Expand Down
6 changes: 5 additions & 1 deletion helpers/hdict.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import logging
from typing import (
TYPE_CHECKING,
Any,
Dict,
Generator,
Expand All @@ -23,6 +24,9 @@

import helpers.hdbg as hdbg

if TYPE_CHECKING:
from config_root.config.config_ import Config

_LOG = logging.getLogger(__name__)


Expand Down Expand Up @@ -71,7 +75,7 @@ def extract_leaf_values(nested: Dict[Any, Any], key: Any) -> Dict[Any, Any]:


def typed_get(
dict_: Union[Dict, "Config"], # noqa: F821
dict_: Union[Dict, "Config"], # noqa: F821 # type: ignore
key: Any,
default_value: Optional[Any] = _NO_VALUE_SPECIFIED,
*,
Expand Down
26 changes: 13 additions & 13 deletions helpers/hdockerized_executables.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,18 +247,18 @@ def convert_pandoc_cmd_to_arguments(cmd: str) -> Dict[str, Any]:
:return: A dictionary with the parsed arguments.
"""
# Use shlex.split to tokenize the string like a shell would.
cmd = shlex.split(cmd)
cmd_list = shlex.split(cmd)
# Remove the newline characters that come from multiline commands with `\n`.
cmd = [arg for arg in cmd if arg != "\n"]
cmd_list = [arg for arg in cmd_list if arg != "\n"]
_LOG.debug(hprint.to_str("cmd"))
# The first option is the executable.
hdbg.dassert_eq(cmd[0], "pandoc")
hdbg.dassert_eq(cmd_list[0], "pandoc")
# pandoc parser is difficult to emulate with `argparse`, since pandoc allows
# the input file to be anywhere in the command line options. In our case we
# don't know all the possible command line options so for simplicity we
# assume that the first option is always the input file.
in_file_path = cmd[1]
cmd = cmd[2:]
in_file_path = cmd_list[1]
cmd_list = cmd_list[2:]
_LOG.debug(hprint.to_str("cmd"))
#
parser = argparse.ArgumentParser()
Expand All @@ -267,7 +267,7 @@ def convert_pandoc_cmd_to_arguments(cmd: str) -> Dict[str, Any]:
parser.add_argument("--template", default=None)
parser.add_argument("--extract-media", default=None)
# Parse known arguments and capture the rest.
args, unknown_args = parser.parse_known_args(cmd)
args, unknown_args = parser.parse_known_args(cmd_list)
_LOG.debug(hprint.to_str("args unknown_args"))
# Filter out the option terminator if present.
# Remove the `--` option terminator to treat `--option-after-terminator` as a regular argument, not as an option.
Expand Down Expand Up @@ -707,31 +707,31 @@ def convert_latex_cmd_to_arguments(cmd: str) -> Dict[str, Any]:
:return: A dictionary with the parsed arguments.
"""
# Use shlex.split to tokenize the string like a shell would.
cmd = shlex.split(cmd)
cmd_list = shlex.split(cmd)
# Remove the newline characters that come from multiline commands with `\n`.
cmd = [arg for arg in cmd if arg != "\n"]
cmd_list = [arg for arg in cmd_list if arg != "\n"]
_LOG.debug(hprint.to_str("cmd"))
# The first option is the executable.
hdbg.dassert_eq(cmd[0], "pdflatex")
hdbg.dassert_eq(cmd_list[0], "pdflatex")
# We assume that the last argument is always the input file.
in_file_path = cmd[-1]
in_file_path = cmd_list[-1]
hdbg.dassert(
not in_file_path.startswith("-"),
"Invalid input file '%s'",
in_file_path,
)
hdbg.dassert_file_exists(in_file_path)
cmd = cmd[1:-1]
cmd_list = cmd_list[1:-1]
_LOG.debug(hprint.to_str("cmd"))
#
parser = argparse.ArgumentParser()
parser.add_argument("--output-directory", required=True)
# Latex uses options like `-XYZ` which confuse `argparse` so we need to
# replace `-XYZ` with `--XYZ`.
cmd = [re.sub(r"^-", r"--", cmd_opts) for cmd_opts in cmd]
cmd_list = [re.sub(r"^-", r"--", cmd_opts) for cmd_opts in cmd_list]
_LOG.debug(hprint.to_str("cmd"))
# Parse known arguments and capture the rest.
args, unknown_args = parser.parse_known_args(cmd)
args, unknown_args = parser.parse_known_args(cmd_list)
_LOG.debug(hprint.to_str("args unknown_args"))
# Return all the arguments in a dictionary with names that match the
# function signature of `run_dockerized_pandoc()`.
Expand Down
6 changes: 3 additions & 3 deletions helpers/henv.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ def _get_library_version(lib_name: str) -> str:
return version


def _get_package_info() -> Tuple[List[str], int]:
def _get_package_info() -> Tuple[str, int]:
"""
Get package version information.

Expand Down Expand Up @@ -559,8 +559,8 @@ def get_system_signature(git_commit_type: str = "all") -> Tuple[str, int]:
hprint.dassert_one_trailing_newline(txt_tmp)
txt.append(txt_tmp)
#
txt = hprint.to_info("System signature", txt)
return txt, failed_imports
txt_str: str = hprint.to_info("System signature", txt)
return txt_str, failed_imports


# #############################################################################
Expand Down
9 changes: 5 additions & 4 deletions helpers/hgit.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import random
import re
import string
from typing import List, Optional, Tuple
from typing import Any, List, Optional, Tuple, cast

import helpers.hdbg as hdbg
import helpers.hprint as hprint
Expand Down Expand Up @@ -242,7 +242,8 @@ def find_git_root(path: str = ".") -> str:
)
# Update the path to the parent directory for the next iteration.
path = parent
return git_root_dir
hdbg.dassert_is_not(git_root_dir, None, "Git root directory should have been found")
return str(git_root_dir)


# #############################################################################
Expand Down Expand Up @@ -853,7 +854,7 @@ def get_path_from_git_root(
super_module,
ret,
)
return ret
return str(ret)


# TODO(gp): Rewrite this function in a better way.
Expand Down Expand Up @@ -1394,7 +1395,7 @@ def does_branch_exist(
exists_tmp = does_branch_exist(
branch_name, mode_tmp, dir_name=dir_name
)
exists |= exists_tmp
exists = exists or exists_tmp
return exists
#
hdbg.dassert_in(mode, ("git_local", "git_remote", "github"))
Expand Down
Loading