diff --git a/setup.py b/setup.py index a7cca41..efa1423 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ "websockets>=11.0.3", ], extras_require={ - "capture": ["videodb-capture-bin>=0.2.8"], + "capture": ["videodb-capture-bin>=0.2.10"], }, classifiers=[ "Intended Audience :: Developers", diff --git a/videodb/__about__.py b/videodb/__about__.py index c1f5177..0cdd560 100644 --- a/videodb/__about__.py +++ b/videodb/__about__.py @@ -2,7 +2,7 @@ -__version__ = "0.4.2" +__version__ = "0.4.3" __title__ = "videodb" __author__ = "videodb" __email__ = "contact@videodb.io" diff --git a/videodb/asset.py b/videodb/asset.py index 6061b4b..805a862 100644 --- a/videodb/asset.py +++ b/videodb/asset.py @@ -37,8 +37,8 @@ def __init__( end: Optional[float] = None, ) -> None: super().__init__(asset_id) - self.start: int = start - self.end: Union[int, None] = end + self.start: Optional[float] = start + self.end: Optional[float] = end def to_json(self) -> dict: return copy.deepcopy(self.__dict__) @@ -63,8 +63,8 @@ def __init__( fade_out_duration: Optional[Union[int, float]] = 0, ): super().__init__(asset_id) - self.start: int = start - self.end: Union[int, None] = end + self.start: Optional[float] = start + self.end: Optional[float] = end self.disable_other_tracks: bool = disable_other_tracks self.fade_in_duration: Union[int, float] = validate_max_supported( fade_in_duration, MaxSupported.fade_duration, "fade_in_duration" diff --git a/videodb/capture.py b/videodb/capture.py index 6626c74..dca7e59 100644 --- a/videodb/capture.py +++ b/videodb/capture.py @@ -91,7 +91,6 @@ def to_dict(self) -> Dict[str, Any]: "channel_id": self.id, "type": self.type, "name": self.name, - "record": True, "store": self.store, "is_primary": self.is_primary, } @@ -134,21 +133,24 @@ def __init__( mics: List[AudioChannel] = None, displays: List[VideoChannel] = None, system_audio: List[AudioChannel] = None, + cameras: List[VideoChannel] = None, ): self.mics: ChannelList = ChannelList(mics or []) self.displays: ChannelList = ChannelList(displays or []) self.system_audio: ChannelList = ChannelList(system_audio or []) + self.cameras: ChannelList = ChannelList(cameras or []) def __repr__(self): return ( f"Channels(" f"mics={len(self.mics)}, " f"displays={len(self.displays)}, " - f"system_audio={len(self.system_audio)})" + f"system_audio={len(self.system_audio)}, " + f"cameras={len(self.cameras)})" ) def all(self) -> List[Channel]: - """Return a flat list of all channels.""" + """Return a flat list of all capturable channels (excludes cameras).""" return list(self.mics) + list(self.displays) + list(self.system_audio) @@ -334,30 +336,34 @@ async def list_channels(self) -> Channels: mics = [] displays = [] system_audio = [] - + cameras = [] + for ch in raw_channels: c_type = ch.get("type") c_id = ch.get("channel_id") or ch.get("id") c_name = ch.get("name", "") - + if not c_id: logger.warning(f"Skipping channel with missing ID: {ch}") continue - # Categorize based on type and name patterns - if c_type == "video": + # Categorize based on channel ID prefix + if c_id.startswith("mic:"): + mics.append(AudioChannel(id=c_id, name=c_name, client=self)) + elif c_id.startswith("display:") or c_id.startswith("screen:"): displays.append(VideoChannel(id=c_id, name=c_name, client=self)) + elif c_id.startswith("system_audio:"): + system_audio.append(AudioChannel(id=c_id, name=c_name, client=self)) + elif c_id.startswith("camera:"): + cameras.append(VideoChannel(id=c_id, name=c_name, client=self)) elif c_type == "audio": - # Distinguish between mic and system audio based on common patterns - name_lower = c_name.lower() - if "system" in name_lower or "output" in name_lower or "speaker" in name_lower: - system_audio.append(AudioChannel(id=c_id, name=c_name, client=self)) - else: - mics.append(AudioChannel(id=c_id, name=c_name, client=self)) + mics.append(AudioChannel(id=c_id, name=c_name, client=self)) + elif c_type == "video": + displays.append(VideoChannel(id=c_id, name=c_name, client=self)) else: logger.debug(f"Unknown channel type '{c_type}' for channel '{c_name}'") - - return Channels(mics=mics, displays=displays, system_audio=system_audio) + + return Channels(mics=mics, displays=displays, system_audio=system_audio, cameras=cameras) async def start_session( self, diff --git a/videodb/editor.py b/videodb/editor.py index 5c40282..daaf1e9 100644 --- a/videodb/editor.py +++ b/videodb/editor.py @@ -1,5 +1,7 @@ import json +import logging import requests +import warnings from typing import List, Optional, Union from enum import Enum @@ -8,6 +10,8 @@ from videodb.exceptions import InvalidRequestError +logger = logging.getLogger(__name__) + MAX_PAYLOAD_SIZE = 100 * 1024 @@ -840,6 +844,11 @@ def __init__( ): """Initialize a CaptionAsset instance. + .. note:: + When using ``src="auto"``, the video must be indexed first + (e.g. via ``video.index_spoken_words()``) so that a transcript + is available for caption generation. + :param str src: Caption source ("auto" for auto-generated or base64 encoded ass string) :param FontStyling font: (optional) Font styling properties :param str primary_color: Primary text color in ASS format (default: "&H00FFFFFF") @@ -849,6 +858,12 @@ def __init__( :param Positioning position: (optional) Caption positioning properties :param CaptionAnimation animation: (optional) Caption animation effect """ + if src == "auto": + warnings.warn( + "CaptionAsset(src='auto'): the video must be indexed " + "(e.g. video.index_spoken_words()) for captions to be generated.", + stacklevel=2, + ) self.src = src self.font = font if font is not None else FontStyling() self.primary_color = primary_color diff --git a/videodb/search.py b/videodb/search.py index 94730ec..f2b9207 100644 --- a/videodb/search.py +++ b/videodb/search.py @@ -48,6 +48,8 @@ def _format_results(self): scene_index_id=doc.get("scene_index_id"), scene_index_name=doc.get("scene_index_name"), metadata=doc.get("metadata"), + stream_url=doc.get("stream_link"), + player_url=doc.get("player_url"), ) ) diff --git a/videodb/shot.py b/videodb/shot.py index b261077..82d90cd 100644 --- a/videodb/shot.py +++ b/videodb/shot.py @@ -35,6 +35,8 @@ def __init__( scene_index_id: Optional[str] = None, scene_index_name: Optional[str] = None, metadata: Optional[dict] = None, + stream_url: Optional[str] = None, + player_url: Optional[str] = None, ) -> None: self._connection = _connection self.video_id = video_id @@ -47,8 +49,8 @@ def __init__( self.scene_index_id = scene_index_id self.scene_index_name = scene_index_name self.metadata = metadata - self.stream_url = None - self.player_url = None + self.stream_url = stream_url + self.player_url = player_url def __repr__(self) -> str: repr_str = ( diff --git a/videodb/video.py b/videodb/video.py index 3c7126d..367ba87 100644 --- a/videodb/video.py +++ b/videodb/video.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, List, Dict, Tuple, Any +from typing import Literal, Optional, Union, List, Dict, Tuple, Any from videodb._utils._video import play_stream from videodb._constants import ( ApiPath, @@ -249,10 +249,12 @@ def get_transcript_text( def generate_transcript( self, force: bool = None, + language_code: Optional[str] = None, ) -> str: """Generate transcript for the video. :param bool force: Force generate new transcript + :param str language_code: (optional) Language code of the video :return: Full transcript text as string :rtype: str """ @@ -260,6 +262,7 @@ def generate_transcript( path=f"{ApiPath.video}/{self.id}/{ApiPath.transcription}", data={ "force": True if force else False, + "language_code": language_code, }, ) transcript = transcript_data.get("word_timestamps", []) @@ -702,9 +705,9 @@ def add_subtitle(self, style: SubtitleStyle = SubtitleStyle()) -> str: def clip( self, prompt: str, - content_type: str, - model_name: str, - ) -> str: + content_type: Literal["spoken", "visual", "multimodal"], + model_name: Literal["basic", "pro", "ultra"], + ) -> SearchResult: """Generate a clip from the video using a prompt. :param str prompt: Prompt to generate the clip :param str content_type: Content type for the clip. Valid options: "spoken", "visual", "multimodal"