Skip to content

Imdb review

IMDBReviews #

Bases: BaseReader

Source code in llama_index/readers/imdb_review/base.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
class IMDBReviews(BaseReader):
    """Reader that turns the IMDB reviews of one movie into Document objects."""

    def __init__(
        self,
        movie_name_year: str,
        webdriver_engine: str = "google",
        generate_csv: bool = False,
        multithreading: bool = False,
        max_workers: int = 0,
        reviews_folder: str = "movie_reviews",
    ):
        """Get the IMDB reviews of a movie.

        Args:
            movie_name_year (str): Movie name together with its year.
            webdriver_engine (str, optional): Webdriver engine to use; one of
                "google", "edge" or "firefox". Defaults to "google".
            generate_csv (bool, optional): Whether to generate a CSV.
                Defaults to False.
            multithreading (bool, optional): Whether to use multithreading.
                Defaults to False.
            max_workers (int, optional): Number of worker threads when
                multithreading is used. Defaults to 0.
            reviews_folder (str, optional): Forwarded unchanged to
                ``main_scraper``; presumably the folder used for the scraped
                reviews / CSV output -- TODO confirm against the scraper.
                Defaults to "movie_reviews".

        Raises:
            AssertionError: If ``webdriver_engine`` is not a supported engine.
        """
        # Raise explicitly instead of using an ``assert`` statement: asserts
        # are removed when Python runs with -O, which would let an unsupported
        # engine through. AssertionError is kept so existing callers that
        # catch it keep working.
        if webdriver_engine not in ("google", "edge", "firefox"):
            raise AssertionError(
                "The webdriver should be in ['google','edge','firefox']"
            )
        self.movie_name_year = movie_name_year
        self.webdriver_engine = webdriver_engine
        self.generate_csv = generate_csv
        self.multithreading = multithreading
        self.max_workers = max_workers
        self.reviews_folder = reviews_folder

    def load_data(self) -> List[Document]:
        """Scrape the movie's reviews from the IMDB website.

        Returns:
            List[Document]: One document per review. The text is the review
            title and comment joined by a space; ``extra_info`` carries the
            date, rating, link, helpful votes, total votes and spoiler flag.
        """
        # main_scraper hands back eight results that are indexed in lockstep
        # below (assumed to be parallel sequences, one entry per review).
        (
            dates,
            titles,
            comments,
            ratings,
            links,
            helpful_votes,
            total_votes,
            spoiler_flags,
        ) = main_scraper(
            self.movie_name_year,
            self.webdriver_engine,
            self.generate_csv,
            self.multithreading,
            self.max_workers,
            self.reviews_folder,
        )

        # Positional indexing (rather than zip) is deliberate: a sequence that
        # is shorter than ``dates`` fails loudly instead of being truncated.
        return [
            Document(
                text=titles[i] + " " + comments[i],
                extra_info={
                    "date": dates[i],
                    "rating": ratings[i],
                    "link": links[i],
                    "found_helpful_votes": helpful_votes[i],
                    "total_votes": total_votes[i],
                    # NOTE: key spelling ("spolier") is kept as-is; it is
                    # runtime data that consumers may already depend on.
                    "spolier": spoiler_flags[i],
                },
            )
            for i in range(len(dates))
        ]

load_data #

load_data() -> List[Document]

从IMDB网站电影评论中抓取数据。

返回: List[Document]:LlamaIndex 文档对象列表,每个文档的额外信息包含日期、评分、链接、有用票数、总票数和剧透标记

Source code in llama_index/readers/imdb_review/base.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
    def load_data(self) -> List[Document]:
        """Scrape the movie's reviews from the IMDB website.

        Returns:
            List[Document]: One document per review. The text is the review
            title and comment joined by a space; ``extra_info`` carries the
            date, rating, link, helpful votes, total votes and spoiler flag.
        """
        scraped = main_scraper(
            self.movie_name_year,
            self.webdriver_engine,
            self.generate_csv,
            self.multithreading,
            self.max_workers,
            self.reviews_folder,
        )
        # The scraper result unpacks into exactly eight values, which are
        # indexed in lockstep below (assumed to be parallel sequences).
        (
            dates,
            titles,
            comments,
            ratings,
            links,
            helpful_votes,
            total_votes,
            spoiler_flags,
        ) = scraped

        # Index by position, driven by the number of dates, so that a shorter
        # companion sequence raises instead of being silently truncated.
        return [
            Document(
                text=titles[idx] + " " + comments[idx],
                extra_info={
                    "date": dates[idx],
                    "rating": ratings[idx],
                    "link": links[idx],
                    "found_helpful_votes": helpful_votes[idx],
                    "total_votes": total_votes[idx],
                    # Key spelling is part of the emitted metadata; unchanged.
                    "spolier": spoiler_flags[idx],
                },
            )
            for idx in range(len(dates))
        ]