Kaltura esearch

初始化文件。

KalturaESearchReader #

Bases: BaseReader

Kaltura eSearch API Reader.

Source code in llama_index/readers/kaltura_esearch/base.py

class KalturaESearchReader(BaseReader):
    """Kaltura eSearch API Reader.

    Searches a Kaltura account through the eSearch API and wraps every
    matching media entry in a ``Document``. The Kaltura client libraries are
    imported lazily, and the client/session is set up by ``load_data`` when no
    session has been loaded yet.
    """

    def __init__(
        self,
        partner_id: int = 0,
        api_secret: str = "INSERT_YOUR_ADMIN_SECRET",
        user_id: str = "INSERT_YOUR_USER_ID",
        ks_type: int = 2,
        ks_expiry: int = 86400,
        ks_privileges: str = "disableentitlement",
        kaltura_api_endpoint: str = "https://cdnapi-ev.kaltura.com/",
        request_timeout: int = 500,
        should_log_api_calls: bool = False,
    ) -> None:
        """Initialize a new KalturaESearchReader instance.

        Only stores the settings; no Kaltura library is imported and no
        network call is made here.

        Args:
            partner_id (int): Kaltura account ID. Defaults to 0.
            api_secret (str): Kaltura API admin secret. Defaults to
                "INSERT_YOUR_ADMIN_SECRET".
            user_id (str): User ID under which all API actions are performed
                and logged. Defaults to "INSERT_YOUR_USER_ID".
            ks_type (int): Kaltura session type. Defaults to 2. ``None`` is
                replaced by ``KalturaSessionType.ADMIN`` when the session is
                created.
            ks_expiry (int): Validity of the Kaltura session, in seconds.
                Defaults to 86400.
            ks_privileges (str): Kaltura session privileges. Defaults to
                "disableentitlement".
            kaltura_api_endpoint (str): Kaltura API endpoint. Defaults to
                "https://cdnapi-ev.kaltura.com/".
            request_timeout (int): API request timeout, in seconds. Defaults
                to 500. Also used as the timeout of the caption download.
            should_log_api_calls (bool): Whether Kaltura requests are logged.
                Defaults to False.
        """
        self.partner_id = partner_id
        self.api_secret = api_secret
        self.user_id = user_id
        self.ks_type = ks_type
        self.ks_expiry = ks_expiry
        self.ks_privileges = ks_privileges
        self.kaltura_api_endpoint = kaltura_api_endpoint
        self.request_timeout = request_timeout
        self.should_log_api_calls = should_log_api_calls
        # Kaltura libraries will be loaded when they are needed
        self._kaltura_loaded = False

    def _load_kaltura(self):
        """Load the Kaltura libraries and initialize the Kaltura client.

        Builds the client configuration, creates the client, generates a
        session (KS) from the stored credentials and attaches it to the
        client. ``self._kaltura_loaded`` is set to True only when every step
        succeeded. Failures are logged and swallowed, so the flag stays False
        and the next ``load_data`` call tries again.
        """
        from KalturaClient import KalturaClient
        from KalturaClient.Base import IKalturaLogger, KalturaConfiguration
        from KalturaClient.Plugins.Core import KalturaSessionType

        class KalturaLogger(IKalturaLogger):
            """Forward Kaltura client log messages to the ``logging`` module."""

            def log(self, msg):
                logging.info(msg)

        try:
            self.config = KalturaConfiguration()
            self.config.requestTimeout = self.request_timeout
            self.config.serviceUrl = self.kaltura_api_endpoint
            if self.should_log_api_calls:
                self.config.setLogger(KalturaLogger())
            self.client = KalturaClient(self.config)
            if self.ks_type is None:
                self.ks_type = KalturaSessionType.ADMIN
            self.ks = self.client.generateSessionV2(
                self.api_secret,
                self.user_id,
                self.ks_type,
                self.partner_id,
                self.ks_expiry,
                self.ks_privileges,
            )
            self.client.setKs(self.ks)
            self._kaltura_loaded = True
        except Exception:
            # Deliberately best-effort: keep going, but record the traceback so
            # the real cause is not hidden behind the generic message.
            logger.exception("Kaltura Auth failed, check your credentials")

    @staticmethod
    def _first_search_item(items_data):
        """Return ``items_data[0].items[0]``, or None when it is missing."""
        try:
            return items_data[0].items[0]
        except (IndexError, TypeError, AttributeError):
            return None

    def _load_from_search_params(
        self, search_params, with_captions: bool = True, max_entries: int = 10
    ) -> List[Dict[str, Any]]:
        """Run an eSearch entry search and convert the results to documents.

        Args:
            search_params: Search parameters for Kaltura eSearch.
            with_captions (bool): If True, the transcript of each entry is
                fetched and used as the document text.
            max_entries (int): Maximum number of entries to return; sent to
                Kaltura as the page size of a single page.

        Returns:
            list: One ``Document`` per search result (the declared annotation
            is kept unchanged for interface stability).
            If captions were requested and the result carries a caption asset,
            the text is the JSON ``{"video_transcript": ...}`` and
            ``extra_info`` holds all entry metadata. Otherwise the text is the
            JSON of all entry metadata and ``extra_info`` holds only
            ``entry_id``.
            An empty list is returned when the search fails for any reason
            other than an invalid session.

        Raises:
            ValueError: If Kaltura reports the error code ``INVALID_KS``.
        """
        from KalturaClient.Plugins.Core import KalturaPager

        entries = []
        try:
            pager = KalturaPager()
            pager.pageIndex = 1
            pager.pageSize = max_entries
            response = self.client.elasticSearch.eSearch.searchEntry(
                search_params, pager
            )

            for search_result in response.objects or []:
                entry = search_result.object

                entry_info = {
                    "entry_id": str(entry.id),
                    "entry_name": str(entry.name),
                    "entry_description": str(entry.description or ""),
                    # mediaType may be missing; do not fail the whole search.
                    "entry_media_type": int(
                        getattr(entry.mediaType, "value", None) or 0
                    ),
                    "entry_media_date": int(entry.createdAt or 0),
                    "entry_ms_duration": int(entry.msDuration or 0),
                    "entry_last_played_at": int(entry.lastPlayedAt or 0),
                    "entry_application": str(entry.application or ""),
                    "entry_tags": str(entry.tags or ""),
                    "entry_reference_id": str(entry.referenceId or ""),
                }

                caption_item = (
                    self._first_search_item(search_result.itemsData)
                    if with_captions
                    else None
                )
                if caption_item is not None and hasattr(
                    caption_item, "captionAssetId"
                ):
                    # TODO: fetch captions per language, or ask for a specific
                    # language code.
                    entry_dict = {
                        "video_transcript": self._get_json_transcript(
                            caption_item.captionAssetId
                        )
                    }
                else:
                    # No transcript available or requested: the metadata
                    # becomes the text and only the id is kept as extra info.
                    entry_dict = entry_info.copy()
                    entry_info = {"entry_id": str(entry.id)}

                entries.append(
                    Document(text=json.dumps(entry_dict), extra_info=entry_info)
                )

            return entries

        except Exception as e:
            # Not every exception carries a ``code`` attribute; reading it
            # unguarded would raise AttributeError from inside this handler.
            if getattr(e, "code", None) == "INVALID_KS":
                raise ValueError(
                    f"Kaltura Auth failed, check your credentials: {e}"
                ) from e
            logger.error("An error occurred while loading with search params: %s", e)
            return []

    def _get_json_transcript(self, caption_asset_id):
        """Fetch the JSON transcript/captions of a caption asset.

        Args:
            caption_asset_id: ID of the caption asset whose captions should
                be fetched as JSON.

        Returns:
            The parsed JSON transcript, or an empty dict if it could not be
            found or any error occurred (the error is logged).
        """
        # TODO: fetch captions per language, or ask for a specific language code.
        try:
            cap_json_url = self.client.caption.captionAsset.serveAsJson(
                caption_asset_id
            )
            # Bound the download and reject HTTP error responses instead of
            # hanging forever or parsing an error body as a transcript.
            response = requests.get(cap_json_url, timeout=self.request_timeout)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error("An error occurred while getting captions: %s", e)
            return {}

    def load_data(
        self,
        search_params: Any = None,
        search_operator_and: bool = True,
        free_text: Optional[str] = None,
        category_ids: Optional[str] = None,
        with_captions: bool = True,
        max_entries: int = 5,
    ) -> List[Dict[str, Any]]:
        """Load data from Kaltura based on search parameters.

        Args:
            search_params: Search parameters of type
                KalturaESearchEntryParams with a pre-set search query. If not
                provided, the query is built from the other arguments.
            search_operator_and (bool): If True, the built query joins its
                filters with AND; if False, with OR. Only used when
                ``search_params`` is None.
            free_text (Optional[str]): If provided, added as a partial-match
                free-text term. Only used when ``search_params`` is None.
            category_ids (Optional[str]): If provided, only entries inside
                these category IDs are searched. Only used when
                ``search_params`` is None.
            with_captions (bool): Whether to also download caption/transcript
                content from Kaltura. When the query is built here, it also
                restricts the search to entries that have captions.
            max_entries (int): Maximum number of entries to pull from Kaltura,
                between 0 and 500 (the maximum pageSize in Kaltura).

        Returns:
            The list built by ``_load_from_search_params``: one ``Document``
            per matched entry (the declared annotation is kept unchanged for
            interface stability). The entry metadata fields are
            entry_id:str, entry_name:str, entry_description:str,
            entry_media_type:int, entry_media_date:int, entry_ms_duration:int,
            entry_last_played_at:int, entry_application:str, entry_tags:str,
            entry_reference_id:str.
            With a transcript, the text is ``{"video_transcript": ...}`` and
            ``extra_info`` carries all metadata; otherwise the text is the
            metadata and ``extra_info`` carries only ``entry_id``.
        """
        from KalturaClient.Plugins.ElasticSearch import (
            KalturaCategoryEntryStatus,
            KalturaESearchCaptionFieldName,
            KalturaESearchCaptionItem,
            KalturaESearchCategoryEntryFieldName,
            KalturaESearchCategoryEntryItem,
            KalturaESearchEntryOperator,
            KalturaESearchEntryParams,
            KalturaESearchItemType,
            KalturaESearchOperatorType,
            KalturaESearchUnifiedItem,
        )

        # Load and initialize the Kaltura client
        if not self._kaltura_loaded:
            self._load_kaltura()

        # Build a query only when the caller did not supply one
        if search_params is None:
            search_params = KalturaESearchEntryParams()
            # Create an AND/OR relationship between the following search queries
            search_operator = KalturaESearchEntryOperator()
            search_params.searchOperator = search_operator
            search_operator.operator = (
                KalturaESearchOperatorType.AND_OP
                if search_operator_and
                else KalturaESearchOperatorType.OR_OP
            )
            search_items = []
            search_operator.searchItems = search_items
            # Find only entries that have captions
            if with_captions:
                caption_item = KalturaESearchCaptionItem()
                caption_item.fieldName = KalturaESearchCaptionFieldName.CONTENT
                caption_item.itemType = KalturaESearchItemType.EXISTS
                search_items.append(caption_item)
            # Find only entries that are inside these category IDs
            if category_ids is not None:
                category_item = KalturaESearchCategoryEntryItem()
                category_item.categoryEntryStatus = KalturaCategoryEntryStatus.ACTIVE
                category_item.fieldName = KalturaESearchCategoryEntryFieldName.FULL_IDS
                category_item.addHighlight = False
                category_item.itemType = KalturaESearchItemType.EXACT_MATCH
                category_item.searchTerm = category_ids
                search_items.append(category_item)
            # Find only entries that contain this free text
            if free_text is not None:
                unified_item = KalturaESearchUnifiedItem()
                unified_item.searchTerm = free_text
                unified_item.itemType = KalturaESearchItemType.PARTIAL
                search_items.append(unified_item)

        return self._load_from_search_params(search_params, with_captions, max_entries)

load_data #

load_data(
    search_params: Any = None,
    search_operator_and: bool = True,
    free_text: Optional[str] = None,
    category_ids: Optional[str] = None,
    with_captions: bool = True,
    max_entries: int = 5,
) -> List[Dict[str, Any]]

从Kaltura根据搜索参数加载数据。该函数返回一个字典列表。每个字典表示一个媒体条目，其中键是字符串（字段名称），值可以是任何类型。

Returns:

Name	Type	Description
	`List[Dict[str, Any]]`	表示Kaltura媒体条目的列表。条目元数据字段为：entry_id:str, entry_name:str, entry_description:str, entry_media_type:int, entry_media_date:int, entry_ms_duration:int, entry_last_played_at:int, entry_application:str, entry_tags:str, entry_reference_id:str；字幕转录内容保存在video_transcript:JSON中。如果with_captions为False，则entry_info仅包括entry_id，entry_dict包括所有其他条目信息。如果with_captions为True，则entry_info包括所有条目信息，entry_dict仅包括通过self._get_json_transcript(caption_asset_id)获取的条目字幕转录。

Source code in llama_index/readers/kaltura_esearch/base.py

    def load_data(
        self,
        search_params: Any = None,
        search_operator_and: bool = True,
        free_text: Optional[str] = None,
        category_ids: Optional[str] = None,
        with_captions: bool = True,
        max_entries: int = 5,
    ) -> List[Dict[str, Any]]:
        """Load data from Kaltura based on search parameters.

        Sets up the Kaltura client first if it has not been loaded yet, builds
        a search query when none is supplied, and hands the search over to
        ``_load_from_search_params``.

        Args:
            search_params: Search parameters of type
                KalturaESearchEntryParams with a pre-set search query. If not
                provided, the query is built from the other arguments.
            search_operator_and (bool): If True, the built query joins its
                filters with AND; if False, with OR. Only used when
                ``search_params`` is None.
            free_text (Optional[str]): If provided, added as a partial-match
                free-text term. Only used when ``search_params`` is None.
            category_ids (Optional[str]): If provided, only entries inside
                these category IDs are searched. Only used when
                ``search_params`` is None.
            with_captions (bool): Whether to also download caption/transcript
                content from Kaltura. When the query is built here, it also
                restricts the search to entries that have captions.
            max_entries (int): Maximum number of entries to pull from Kaltura,
                between 0 and 500 (the maximum pageSize in Kaltura).

        Returns:
            Whatever ``_load_from_search_params`` returns for the final search
            parameters, ``with_captions`` and ``max_entries``.
        """
        from KalturaClient.Plugins.ElasticSearch import (
            KalturaCategoryEntryStatus,
            KalturaESearchCaptionFieldName,
            KalturaESearchCaptionItem,
            KalturaESearchCategoryEntryFieldName,
            KalturaESearchCategoryEntryItem,
            KalturaESearchEntryOperator,
            KalturaESearchEntryParams,
            KalturaESearchItemType,
            KalturaESearchOperatorType,
            KalturaESearchUnifiedItem,
        )

        # The client and session are created lazily, on first use.
        if not self._kaltura_loaded:
            self._load_kaltura()

        # A caller-supplied query is used untouched; otherwise assemble one.
        if search_params is None:
            search_params = KalturaESearchEntryParams()

            # Top-level operator that combines the filters below with AND/OR.
            combiner = KalturaESearchEntryOperator()
            search_params.searchOperator = combiner
            combiner.operator = (
                KalturaESearchOperatorType.AND_OP
                if search_operator_and
                else KalturaESearchOperatorType.OR_OP
            )
            filters = []
            combiner.searchItems = filters

            if with_captions:
                # Keep only entries that have caption content.
                has_captions = KalturaESearchCaptionItem()
                has_captions.fieldName = KalturaESearchCaptionFieldName.CONTENT
                has_captions.itemType = KalturaESearchItemType.EXISTS
                filters.append(has_captions)

            if category_ids is not None:
                # Keep only entries actively assigned to these category IDs.
                in_categories = KalturaESearchCategoryEntryItem()
                in_categories.categoryEntryStatus = KalturaCategoryEntryStatus.ACTIVE
                in_categories.fieldName = KalturaESearchCategoryEntryFieldName.FULL_IDS
                in_categories.addHighlight = False
                in_categories.itemType = KalturaESearchItemType.EXACT_MATCH
                in_categories.searchTerm = category_ids
                filters.append(in_categories)

            if free_text is not None:
                # Keep only entries in which this free text is found.
                text_match = KalturaESearchUnifiedItem()
                text_match.searchTerm = free_text
                text_match.itemType = KalturaESearchItemType.PARTIAL
                filters.append(text_match)

        return self._load_from_search_params(search_params, with_captions, max_entries)