Skip to content

Kaltura esearch

初始化文件。

KalturaESearchReader #

Bases: BaseReader

Kaltura eSearch API Reader.

Source code in llama_index/readers/kaltura_esearch/base.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
class KalturaESearchReader(BaseReader):
    """Kaltura eSearch API Reader.

    Queries a Kaltura account through the eSearch API and turns each matching
    media entry into a ``Document``. The Kaltura client library is imported
    lazily, on the first call that needs it.
    """

    def __init__(
        self,
        partner_id: int = 0,
        api_secret: str = "INSERT_YOUR_ADMIN_SECRET",
        user_id: str = "INSERT_YOUR_USER_ID",
        ks_type: int = 2,
        ks_expiry: int = 86400,
        ks_privileges: str = "disableentitlement",
        kaltura_api_endpoint: str = "https://cdnapi-ev.kaltura.com/",
        request_timeout: int = 500,
        should_log_api_calls: bool = False,
    ) -> None:
        """Initialize a new KalturaESearchReader.

        No network access happens here; the values are only stored.

        Args:
            partner_id (int): Kaltura account ID. Defaults to 0.
            api_secret (str): Kaltura API admin secret.
                Defaults to "INSERT_YOUR_ADMIN_SECRET".
            user_id (str): User ID under which all API actions are executed
                and logged. Defaults to "INSERT_YOUR_USER_ID".
            ks_type (int): Kaltura session type. Defaults to 2.
            ks_expiry (int): Kaltura session lifetime in seconds.
                Defaults to 86400.
            ks_privileges (str): Kaltura session privileges.
                Defaults to "disableentitlement".
            kaltura_api_endpoint (str): Kaltura API endpoint.
                Defaults to "https://cdnapi-ev.kaltura.com/".
            request_timeout (int): API request timeout in seconds.
                Defaults to 500.
            should_log_api_calls (bool): Whether Kaltura requests are logged.
                Defaults to False.
        """
        self.partner_id = partner_id
        self.api_secret = api_secret
        self.user_id = user_id
        self.ks_type = ks_type
        self.ks_expiry = ks_expiry
        self.ks_privileges = ks_privileges
        self.kaltura_api_endpoint = kaltura_api_endpoint
        self.request_timeout = request_timeout
        self.should_log_api_calls = should_log_api_calls
        # Kaltura libraries will be loaded when they are needed
        self._kaltura_loaded = False

    def _load_kaltura(self):
        """Import the Kaltura client library and open a Kaltura session.

        On success ``self.config``, ``self.client`` and ``self.ks`` are set,
        the session is attached to the client, and ``self._kaltura_loaded``
        becomes True. If anything in that sequence raises, the failure is
        logged together with its traceback and the flag stays False; the
        exception is not re-raised.
        """
        from KalturaClient import KalturaClient
        from KalturaClient.Base import IKalturaLogger, KalturaConfiguration
        from KalturaClient.Plugins.Core import KalturaSessionType

        class KalturaLogger(IKalturaLogger):
            # Forwards the Kaltura client's log messages to the logging module.
            def log(self, msg):
                logging.info(msg)

        try:
            self.config = KalturaConfiguration()
            self.config.requestTimeout = self.request_timeout
            self.config.serviceUrl = self.kaltura_api_endpoint
            if self.should_log_api_calls:
                self.config.setLogger(KalturaLogger())
            self.client = KalturaClient(self.config)
            if self.ks_type is None:
                self.ks_type = KalturaSessionType.ADMIN
            self.ks = self.client.generateSessionV2(
                self.api_secret,
                self.user_id,
                self.ks_type,
                self.partner_id,
                self.ks_expiry,
                self.ks_privileges,
            )
            self.client.setKs(self.ks)
            self._kaltura_loaded = True
        except Exception:
            # Keep the original message but attach the traceback so the real
            # cause (bad argument, missing dependency, ...) is not lost.
            logger.exception("Kaltura Auth failed, check your credentials")

    @staticmethod
    def _first_search_hit(items_data):
        """Return ``items_data[0].items[0]``, or None when it does not exist.

        Covers an empty or unset ``items_data`` as well as a first group with
        no ``items``, so a single entry without hits cannot abort the load.
        """
        try:
            return items_data[0].items[0]
        except (IndexError, TypeError, AttributeError):
            return None

    def _load_from_search_params(
        self, search_params, with_captions: bool = True, max_entries: int = 10
    ) -> List[Document]:
        """Run an eSearch entry query and convert the results to documents.

        Args:
            search_params: Search parameters passed to Kaltura eSearch.
            with_captions (bool): If True, try to fetch the caption transcript
                of every entry.
            max_entries (int): Page size of the single request that is sent,
                i.e. the maximum number of entries returned.

        Returns:
            list: One ``Document`` per entry. When a caption asset is found
            (and ``with_captions`` is True) the text is the JSON string
            ``{"video_transcript": ...}`` and ``extra_info`` carries all entry
            metadata. Otherwise the text is the JSON-encoded metadata and
            ``extra_info`` holds only ``entry_id``. An empty list is returned
            when the search fails for a reason other than an invalid session.

        Raises:
            ValueError: If Kaltura reports the error code ``INVALID_KS``.
        """
        from KalturaClient.Plugins.Core import KalturaPager

        try:
            entries = []
            pager = KalturaPager()
            pager.pageIndex = 1
            pager.pageSize = max_entries
            response = self.client.elasticSearch.eSearch.searchEntry(
                search_params, pager
            )

            for search_result in response.objects:
                entry = search_result.object
                items_data = search_result.itemsData

                entry_info = {
                    "entry_id": str(entry.id),
                    "entry_name": str(entry.name),
                    "entry_description": str(entry.description or ""),
                    "entry_media_type": int(entry.mediaType.value or 0),
                    "entry_media_date": int(entry.createdAt or 0),
                    "entry_ms_duration": int(entry.msDuration or 0),
                    "entry_last_played_at": int(entry.lastPlayedAt or 0),
                    "entry_application": str(entry.application or ""),
                    "entry_tags": str(entry.tags or ""),
                    "entry_reference_id": str(entry.referenceId or ""),
                }

                caption_hit = (
                    self._first_search_hit(items_data) if with_captions else None
                )
                if caption_hit is not None and hasattr(caption_hit, "captionAssetId"):
                    # TODO: change this to fetch captions per language, or as for a specific language code
                    entry_dict = {
                        "video_transcript": self._get_json_transcript(
                            caption_hit.captionAssetId
                        )
                    }
                else:
                    # No transcript: the metadata becomes the document text and
                    # only the entry ID is kept as extra info.
                    entry_dict = entry_info.copy()
                    entry_info = {"entry_id": str(entry.id)}

                entry_doc = Document(text=json.dumps(entry_dict), extra_info=entry_info)
                entries.append(entry_doc)

            return entries

        except Exception as e:
            # Not every exception has a ``code`` attribute; reading it directly
            # would raise AttributeError here and hide the original error.
            if getattr(e, "code", None) == "INVALID_KS":
                raise ValueError(
                    f"Kaltura Auth failed, check your credentials: {e}"
                ) from e
            logger.error(f"An error occurred while loading with search params: {e}")
            return []

    def _get_json_transcript(self, caption_asset_id):
        """Fetch the JSON transcript of a caption asset.

        Args:
            caption_asset_id: ID of the caption asset whose transcript should
                be fetched.

        Returns:
            The decoded JSON transcript, or an empty dict if the URL lookup,
            the HTTP request (including a non-success status) or the JSON
            decoding fails.
        """
        # TODO: change this to fetch captions per language, or as for a specific language code
        try:
            cap_json_url = self.client.caption.captionAsset.serveAsJson(
                caption_asset_id
            )
            # Bound the download with the configured timeout so a stalled
            # connection cannot block the reader indefinitely.
            response = requests.get(cap_json_url, timeout=self.request_timeout)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"An error occurred while getting captions: {e}")
            return {}

    def load_data(
        self,
        search_params: Any = None,
        search_operator_and: bool = True,
        free_text: Optional[str] = None,
        category_ids: Optional[str] = None,
        with_captions: bool = True,
        max_entries: int = 5,
    ) -> List[Document]:
        """Load media entries from Kaltura according to the search parameters.

        The Kaltura client is initialized on first use. When ``search_params``
        is not given, a query is assembled from the other arguments.

        Args:
            search_params: Ready-made search parameters of type
                KalturaESearchEntryParams. If omitted, the query is built from
                the remaining arguments.
            search_operator_and: If True the constructed query combines its
                filters with AND, otherwise with OR.
            free_text: If provided, used as a free-text (partial match) query.
            category_ids: If provided, only entries inside these category IDs
                are searched.
            with_captions: Whether caption/transcript content is downloaded as
                well. When the query is constructed here, this also restricts
                the search to entries that have caption content.
            max_entries: Maximum number of entries to pull from Kaltura, sent
                as the request's page size. The original documentation gives
                the valid range as 0 to 500 (Kaltura's maximum pageSize).

        Returns:
            List[Document]: One document per Kaltura media entry. The entry
            metadata has the fields entry_id:str, entry_name:str,
            entry_description:str, entry_media_type:int, entry_media_date:int,
            entry_ms_duration:int, entry_last_played_at:int,
            entry_application:str, entry_tags:str, entry_reference_id:str.
            With a transcript, the document text is the JSON transcript
            (key ``video_transcript``) and ``extra_info`` is the metadata.
            Without one, the text is the JSON metadata and ``extra_info``
            contains only ``entry_id``.
        """
        from KalturaClient.Plugins.ElasticSearch import (
            KalturaCategoryEntryStatus,
            KalturaESearchCaptionFieldName,
            KalturaESearchCaptionItem,
            KalturaESearchCategoryEntryFieldName,
            KalturaESearchCategoryEntryItem,
            KalturaESearchEntryOperator,
            KalturaESearchEntryParams,
            KalturaESearchItemType,
            KalturaESearchOperatorType,
            KalturaESearchUnifiedItem,
        )

        # Load and initialize the Kaltura client
        if not self._kaltura_loaded:
            self._load_kaltura()

        # Build a query only when the caller did not supply one
        if search_params is None:
            search_params = KalturaESearchEntryParams()
            # Create an AND/OR relationship between the following search queries -
            search_params.searchOperator = KalturaESearchEntryOperator()
            if search_operator_and:
                search_params.searchOperator.operator = (
                    KalturaESearchOperatorType.AND_OP
                )
            else:
                search_params.searchOperator.operator = KalturaESearchOperatorType.OR_OP
            search_params.searchOperator.searchItems = []
            # Find only entries that have captions -
            if with_captions:
                caption_item = KalturaESearchCaptionItem()
                caption_item.fieldName = KalturaESearchCaptionFieldName.CONTENT
                caption_item.itemType = KalturaESearchItemType.EXISTS
                search_params.searchOperator.searchItems.append(caption_item)
            # Find only entries that are inside these category IDs -
            if category_ids is not None:
                category_item = KalturaESearchCategoryEntryItem()
                category_item.categoryEntryStatus = KalturaCategoryEntryStatus.ACTIVE
                category_item.fieldName = KalturaESearchCategoryEntryFieldName.FULL_IDS
                category_item.addHighlight = False
                category_item.itemType = KalturaESearchItemType.EXACT_MATCH
                category_item.searchTerm = category_ids
                search_params.searchOperator.searchItems.append(category_item)
            # Find only entries that has this freeText found in them -
            if free_text is not None:
                unified_item = KalturaESearchUnifiedItem()
                unified_item.searchTerm = free_text
                unified_item.itemType = KalturaESearchItemType.PARTIAL
                search_params.searchOperator.searchItems.append(unified_item)

        return self._load_from_search_params(search_params, with_captions, max_entries)

load_data #

load_data(
    search_params: Any = None,
    search_operator_and: bool = True,
    free_text: Optional[str] = None,
    category_ids: Optional[str] = None,
    with_captions: bool = True,
    max_entries: int = 5,
) -> List[Dict[str, Any]]

根据搜索参数从Kaltura加载数据。 该函数返回一个Document对象列表(而不是普通字典列表)。 每个Document表示一个媒体条目:其文本是JSON字符串,其元数据(extra_info)是以字段名称为键的字典。

Returns:

Type Description

List[Document]

表示Kaltura媒体条目的Document对象列表(签名中标注的List[Dict[str, Any]]并不准确)。条目元数据具有以下字段:entry_id:str, entry_name:str, entry_description:str, entry_media_type:int, entry_media_date:int, entry_ms_duration:int, entry_last_played_at:int, entry_application:str, entry_tags:str, entry_reference_id:str。字幕内容(JSON)保存在video_transcript键下。

如果with_captions为False(或条目没有可用的字幕资产),则将entry_info设置为仅包括entry_id,将entry_dict设置为包括所有条目信息。

如果with_captions为True,则将entry_info设置为包括所有条目信息,并将entry_dict设置为仅包括通过self._get_json_transcript(caption_asset_id)获取的条目剧本。

Source code in llama_index/readers/kaltura_esearch/base.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
    def load_data(
        self,
        search_params: Any = None,
        search_operator_and: bool = True,
        free_text: Optional[str] = None,
        category_ids: Optional[str] = None,
        with_captions: bool = True,
        max_entries: int = 5,
    ) -> List[Document]:
        """Load media entries from Kaltura according to the search parameters.

        The Kaltura client is initialized on first use. When ``search_params``
        is not given, a query is assembled from the other arguments. The
        actual search and conversion is delegated to
        ``self._load_from_search_params``, whose result is returned unchanged.

        Args:
            search_params: Ready-made search parameters of type
                KalturaESearchEntryParams. If omitted, the query is built from
                the remaining arguments.
            search_operator_and: If True the constructed query combines its
                filters with AND, otherwise with OR.
            free_text: If provided, used as a free-text (partial match) query.
            category_ids: If provided, only entries inside these category IDs
                are searched.
            with_captions: Whether caption/transcript content is downloaded as
                well. When the query is constructed here, this also restricts
                the search to entries that have caption content.
            max_entries: Maximum number of entries to pull from Kaltura. The
                original documentation gives the valid range as 0 to 500
                (Kaltura's maximum pageSize).

        Returns:
            List[Document]: The entries produced by
            ``self._load_from_search_params`` for the final search parameters.
        """
        from KalturaClient.Plugins.ElasticSearch import (
            KalturaCategoryEntryStatus,
            KalturaESearchCaptionFieldName,
            KalturaESearchCaptionItem,
            KalturaESearchCategoryEntryFieldName,
            KalturaESearchCategoryEntryItem,
            KalturaESearchEntryOperator,
            KalturaESearchEntryParams,
            KalturaESearchItemType,
            KalturaESearchOperatorType,
            KalturaESearchUnifiedItem,
        )

        # Load and initialize the Kaltura client
        if not self._kaltura_loaded:
            self._load_kaltura()

        # Build a query only when the caller did not supply one
        if search_params is None:
            search_params = KalturaESearchEntryParams()
            # Create an AND/OR relationship between the following search queries -
            search_params.searchOperator = KalturaESearchEntryOperator()
            if search_operator_and:
                search_params.searchOperator.operator = (
                    KalturaESearchOperatorType.AND_OP
                )
            else:
                search_params.searchOperator.operator = KalturaESearchOperatorType.OR_OP
            search_params.searchOperator.searchItems = []
            # Find only entries that have captions -
            if with_captions:
                caption_item = KalturaESearchCaptionItem()
                caption_item.fieldName = KalturaESearchCaptionFieldName.CONTENT
                caption_item.itemType = KalturaESearchItemType.EXISTS
                search_params.searchOperator.searchItems.append(caption_item)
            # Find only entries that are inside these category IDs -
            if category_ids is not None:
                category_item = KalturaESearchCategoryEntryItem()
                category_item.categoryEntryStatus = KalturaCategoryEntryStatus.ACTIVE
                category_item.fieldName = KalturaESearchCategoryEntryFieldName.FULL_IDS
                category_item.addHighlight = False
                category_item.itemType = KalturaESearchItemType.EXACT_MATCH
                category_item.searchTerm = category_ids
                search_params.searchOperator.searchItems.append(category_item)
            # Find only entries that has this freeText found in them -
            if free_text is not None:
                unified_item = KalturaESearchUnifiedItem()
                unified_item.searchTerm = free_text
                unified_item.itemType = KalturaESearchItemType.PARTIAL
                search_params.searchOperator.searchItems.append(unified_item)

        return self._load_from_search_params(search_params, with_captions, max_entries)