class IMDBReviews(BaseReader):
    """Reader that converts the IMDB reviews of one movie into Document objects."""

    def __init__(
        self,
        movie_name_year: str,
        webdriver_engine: str = "google",
        generate_csv: bool = False,
        multithreading: bool = False,
        max_workers: int = 0,
        reviews_folder: str = "movie_reviews",
    ):
        """Store the scraping configuration for a movie's IMDB reviews.

        Args:
            movie_name_year (str): Name of the movie together with its year.
            webdriver_engine (str, optional): Webdriver engine to use; one of
                "google", "edge" or "firefox". Defaults to "google".
            generate_csv (bool, optional): Whether to generate a CSV.
                Defaults to False.
            multithreading (bool, optional): Whether to use multithreading.
                Defaults to False.
            max_workers (int, optional): Number of worker threads when
                multithreading is used. Defaults to 0.
            reviews_folder (str, optional): Forwarded unchanged to
                ``main_scraper``; presumably the folder that receives the
                scraped reviews -- confirm against the scraper.
                Defaults to "movie_reviews".

        Raises:
            AssertionError: If ``webdriver_engine`` is not a supported engine.
        """
        # Raise explicitly rather than via an `assert` statement: asserts are
        # stripped under `python -O`, which would let an unsupported engine
        # through. The exception type and message are kept so that existing
        # callers observe the same failure.
        if webdriver_engine not in ("google", "edge", "firefox"):
            raise AssertionError(
                "The webdriver should be in ['google','edge','firefox']"
            )

        self.movie_name_year = movie_name_year
        self.webdriver_engine = webdriver_engine
        self.generate_csv = generate_csv
        self.multithreading = multithreading
        self.max_workers = max_workers
        self.reviews_folder = reviews_folder

    def load_data(self) -> List[Document]:
        """Scrape the movie's reviews from IMDB and wrap them in Documents.

        The stored configuration is handed to ``main_scraper``, which returns
        eight parallel sequences (one entry per review in each).

        Returns:
            List[Document]: One document per review. The text is the review
            title and the review comment joined by a single space; the date,
            rating, link, helpful-vote count, total-vote count and spoiler
            flag are attached as extra info.
        """
        (
            reviews_date,
            reviews_title,
            reviews_comment,
            reviews_rating,
            reviews_link,
            review_helpful,
            review_total_votes,
            review_if_spoiler,
        ) = main_scraper(
            self.movie_name_year,
            self.webdriver_engine,
            self.generate_csv,
            self.multithreading,
            self.max_workers,
            self.reviews_folder,
        )

        # The number of documents is driven by `reviews_date`; the remaining
        # sequences are indexed in lockstep, so one that is shorter fails
        # loudly instead of silently dropping reviews.
        return [
            Document(
                text=reviews_title[i] + " " + reviews_comment[i],
                extra_info={
                    "date": date,
                    "rating": reviews_rating[i],
                    "link": reviews_link[i],
                    "found_helpful_votes": review_helpful[i],
                    "total_votes": review_total_votes[i],
                    # NOTE: the key is misspelled ("spolier"), but it is kept
                    # as-is because consumers may already read this exact key.
                    "spolier": review_if_spoiler[i],
                },
            )
            for i, date in enumerate(reviews_date)
        ]