# Prerequisites:
# 1. Create a Google Cloud project
# 2. Enable the Google Drive API:
#   https://console.cloud.google.com/flows/enableapi?apiid=drive.googleapis.com
# 3. Authorize credentials for desktop app:
#   https://developers.google.com/drive/api/quickstart/python#authorize_credentials_for_a_desktop_application # noqa: E501
# 4. For service accounts visit
#   https://cloud.google.com/iam/docs/service-accounts-create

from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Union

from langchain_core._api.deprecation import deprecated
from langchain_core.documents import Document
from pydantic import BaseModel, model_validator, validator

from langchain_community.document_loaders.base import BaseLoader

# OAuth scope requested for every credential type used by the loader
# (read-only access to Drive).
SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
@deprecated(
    since="0.0.32",
    removal="1.0",
    alternative_import="langchain_google_community.GoogleDriveLoader",
)
class GoogleDriveLoader(BaseLoader, BaseModel):
    """Load Google Docs from `Google Drive`.

    Exactly one kind of source must be configured: ``folder_id``, or
    ``document_ids`` and/or ``file_ids``. ``recursive``, ``file_types`` and
    ``load_trashed_files`` only apply when ``folder_id`` is given.
    """

    service_account_key: Path = Path.home() / ".credentials" / "keys.json"
    """Path to the service account key file."""
    credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
    """Path to the credentials file."""
    token_path: Path = Path.home() / ".credentials" / "token.json"
    """Path to the token file."""
    folder_id: Optional[str] = None
    """The folder id to load from."""
    document_ids: Optional[List[str]] = None
    """The document ids to load from."""
    file_ids: Optional[List[str]] = None
    """The file ids to load from."""
    recursive: bool = False
    """Whether to load recursively. Only applies when folder_id is given."""
    file_types: Optional[Sequence[str]] = None
    """The file types to load. Only applies when folder_id is given."""
    load_trashed_files: bool = False
    """Whether to load trashed files. Only applies when folder_id is given."""
    # NOTE(MthwRobinson) - changing the file_loader_cls to type here currently
    # results in pydantic validation errors
    file_loader_cls: Any = None
    """The file loader class to use."""
    file_loader_kwargs: Dict[str, Any] = {}
    """The file loader kwargs to use."""

    @model_validator(mode="before")
    @classmethod
    def validate_inputs(cls, values: Dict[str, Any]) -> Any:
        """Validate the source selection and normalize ``file_types``.

        Args:
            values: Raw constructor input, before field validation.

        Returns:
            A copy of ``values`` in which short-form ``file_types``
            ('document', 'sheet', 'pdf') are replaced by their full MIME types.
            Non-dict input is returned untouched.

        Raises:
            ValueError: If ``folder_id`` is combined with ``document_ids`` or
                ``file_ids``, if none of the three is given, if ``file_types``
                is given without ``folder_id``, or if a file type is not
                supported.
        """
        # A "before" validator may receive something other than a dict (for
        # example an existing instance); leave that for pydantic to handle.
        if not isinstance(values, dict):
            return values
        # Work on a shallow copy so the caller's dict is never modified.
        values = dict(values)

        has_folder = bool(values.get("folder_id"))
        has_ids = bool(values.get("document_ids") or values.get("file_ids"))
        if has_folder and has_ids:
            raise ValueError(
                "Cannot specify both folder_id and document_ids nor "
                "folder_id and file_ids"
            )
        if not has_folder and not has_ids:
            raise ValueError("Must specify either folder_id, document_ids, or file_ids")

        file_types = values.get("file_types")
        if file_types:
            if has_ids:
                raise ValueError(
                    "file_types can only be given when folder_id is given,"
                    " (not when document_ids or file_ids are given)."
                )
            type_mapping = {
                "document": "application/vnd.google-apps.document",
                "sheet": "application/vnd.google-apps.spreadsheet",
                "pdf": "application/pdf",
            }
            allowed_types = (*type_mapping, *type_mapping.values())
            short_names = ", ".join(f"'{x}'" for x in type_mapping)
            full_names = ", ".join(f"'{x}'" for x in type_mapping.values())
            for file_type in file_types:
                if file_type not in allowed_types:
                    raise ValueError(
                        f"Given file type {file_type} is not supported. "
                        f"Supported values are: {short_names}; and "
                        f"their full-form names: {full_names}"
                    )

            # replace short-form file types by full-form file types
            values["file_types"] = [
                type_mapping.get(file_type, file_type) for file_type in file_types
            ]
        return values

    @validator("credentials_path")
    def validate_credentials_path(cls, v: Any, **kwargs: Any) -> Any:
        """Validate that credentials_path exists.

        Raises:
            ValueError: If the given path does not exist.
        """
        if not v.exists():
            raise ValueError(f"credentials_path {v} does not exist")
        return v

    def _load_credentials(self) -> Any:
        """Load credentials.

        The order of loading credentials:
        1. Service account key if file exists
        2. Token path (for OAuth Client) if file exists
        3. Credentials path (for OAuth Client) if file exists
        4. Default credentials. if no credentials found, raise
           DefaultCredentialsError

        Returns:
            A credentials object scoped to ``SCOPES``.

        Raises:
            ImportError: If the Google auth client libraries are not installed.
        """
        # Adapted from https://developers.google.com/drive/api/v3/quickstart/python
        try:
            from google.auth import default
            from google.auth.transport.requests import Request
            from google.oauth2 import service_account
            from google.oauth2.credentials import Credentials
            from google_auth_oauthlib.flow import InstalledAppFlow
        except ImportError as e:
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "google-api-python-client google-auth-httplib2 "
                "google-auth-oauthlib` "
                "to use the Google Drive loader."
            ) from e

        creds = None
        # From service account
        if self.service_account_key.exists():
            return service_account.Credentials.from_service_account_file(
                str(self.service_account_key), scopes=SCOPES
            )

        # From Oauth Client
        if self.token_path.exists():
            creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            elif self.credentials_path.exists():
                flow = InstalledAppFlow.from_client_secrets_file(
                    str(self.credentials_path), SCOPES
                )
                creds = flow.run_local_server(port=0)
            if creds:
                # Persist the refreshed / newly obtained token for the next run.
                with open(self.token_path, "w") as token:
                    token.write(creds.to_json())

        # From Application Default Credentials
        if not creds:
            creds, _ = default(scopes=SCOPES)

        return creds

    def _load_sheet_from_id(self, id: str) -> List[Document]:
        """Load a sheet and all tabs from an ID.

        The first row of each tab is used as the header. Every following row
        becomes one Document whose content is one ``"<header>: <value>"`` line
        per cell. Tabs without values are skipped.

        Args:
            id: The spreadsheet ID.

        Returns:
            One Document per data row, across all tabs.
        """
        from googleapiclient.discovery import build

        creds = self._load_credentials()
        sheets_service = build("sheets", "v4", credentials=creds)
        spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=id).execute()
        sheets = spreadsheet.get("sheets", [])

        documents = []
        for sheet in sheets:
            sheet_name = sheet["properties"]["title"]
            result = (
                sheets_service.spreadsheets()
                .values()
                .get(spreadsheetId=id, range=sheet_name)
                .execute()
            )
            values = result.get("values", [])
            if not values:
                continue  # empty sheet

            header = values[0]
            for i, row in enumerate(values[1:], start=1):
                metadata = {
                    "source": (
                        f"https://docs.google.com/spreadsheets/d/{id}/"
                        f"edit?gid={sheet['properties']['sheetId']}"
                    ),
                    "title": f"{spreadsheet['properties']['title']} - {sheet_name}",
                    "row": i,
                }
                content = []
                for j, v in enumerate(row):
                    # A row may have more cells than the header row; those
                    # cells get an empty title.
                    title = header[j].strip() if len(header) > j else ""
                    content.append(f"{title}: {v.strip()}")

                page_content = "\n".join(content)
                documents.append(Document(page_content=page_content, metadata=metadata))

        return documents

    def _load_document_from_id(self, id: str) -> Document:
        """Load a document from an ID.

        The document is exported as plain text. An ``HttpError`` raised while
        downloading is printed rather than propagated, and whatever was
        downloaded up to that point is returned.

        Args:
            id: The Drive file ID of the document.

        Returns:
            A Document with the exported text and ``source``, ``title`` and
            ``when`` (modified time) metadata.
        """
        from io import BytesIO

        from googleapiclient.discovery import build
        from googleapiclient.errors import HttpError
        from googleapiclient.http import MediaIoBaseDownload

        creds = self._load_credentials()
        service = build("drive", "v3", credentials=creds)

        file = (
            service.files()
            .get(fileId=id, supportsAllDrives=True, fields="modifiedTime,name")
            .execute()
        )
        request = service.files().export_media(fileId=id, mimeType="text/plain")
        fh = BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        try:
            while done is False:
                _, done = downloader.next_chunk()
        except HttpError as e:
            if e.resp.status == 404:
                print("File not found: {}".format(id))  # noqa: T201
            else:
                print("An error occurred: {}".format(e))  # noqa: T201

        text = fh.getvalue().decode("utf-8")
        metadata = {
            "source": f"https://docs.google.com/document/d/{id}/edit",
            "title": f"{file.get('name')}",
            "when": f"{file.get('modifiedTime')}",
        }
        return Document(page_content=text, metadata=metadata)

    def _load_documents_from_folder(
        self, folder_id: str, *, file_types: Optional[Sequence[str]] = None
    ) -> List[Document]:
        """Load documents from a folder.

        Args:
            folder_id: The Drive folder ID to list.
            file_types: Optional MIME types; when given, only files whose
                ``mimeType`` is in this sequence are loaded.

        Returns:
            Documents loaded from Google Docs, Google Sheets and PDF files
            (or any file when ``file_loader_cls`` is set). Trashed files are
            skipped unless ``load_trashed_files`` is true; other file types
            are ignored.
        """
        from googleapiclient.discovery import build

        creds = self._load_credentials()
        service = build("drive", "v3", credentials=creds)
        files = self._fetch_files_recursive(service, folder_id)
        # If file types filter is provided, we'll filter by the file type.
        if file_types:
            _files = [f for f in files if f["mimeType"] in file_types]  # type: ignore
        else:
            _files = files

        returns = []
        for file in _files:
            if file["trashed"] and not self.load_trashed_files:
                continue
            elif file["mimeType"] == "application/vnd.google-apps.document":
                returns.append(self._load_document_from_id(file["id"]))  # type: ignore
            elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
                returns.extend(self._load_sheet_from_id(file["id"]))  # type: ignore
            elif (
                file["mimeType"] == "application/pdf"
                or self.file_loader_cls is not None
            ):
                returns.extend(self._load_file_from_id(file["id"]))  # type: ignore
            else:
                pass
        return returns

    def _fetch_files_recursive(
        self, service: Any, folder_id: str
    ) -> List[Dict[str, Union[str, List[str]]]]:
        """Fetch all files and subfolders recursively.

        Every result page of the listing is followed via ``nextPageToken``,
        so folders with more children than one page holds are returned in
        full.

        Args:
            service: A Drive v3 service object.
            folder_id: The ID of the folder whose children are listed.

        Returns:
            The file entries (``id``, ``name``, ``mimeType``, ``parents``,
            ``trashed``) found in the folder. Subfolders are never returned
            themselves; their contents are included only when ``recursive``
            is true.
        """
        returns: List[Dict[str, Union[str, List[str]]]] = []
        # None on the first request; afterwards the token of the next page.
        page_token = None
        while True:
            results = (
                service.files()
                .list(
                    q=f"'{folder_id}' in parents",
                    pageSize=1000,
                    includeItemsFromAllDrives=True,
                    supportsAllDrives=True,
                    fields="nextPageToken, files(id, name, mimeType, parents, trashed)",
                    pageToken=page_token,
                )
                .execute()
            )
            for file in results.get("files", []):
                if file["mimeType"] == "application/vnd.google-apps.folder":
                    if self.recursive:
                        returns.extend(self._fetch_files_recursive(service, file["id"]))
                else:
                    returns.append(file)

            page_token = results.get("nextPageToken")
            if not page_token:
                break
        return returns

    def _load_documents_from_ids(self) -> List[Document]:
        """Load documents from a list of IDs.

        Raises:
            ValueError: If ``document_ids`` is not set.
        """
        if not self.document_ids:
            raise ValueError("document_ids must be set")

        return [self._load_document_from_id(doc_id) for doc_id in self.document_ids]

    def _load_file_from_id(self, id: str) -> List[Document]:
        """Load a file from an ID.

        The file is downloaded into memory. When ``file_loader_cls`` is set,
        it is instantiated with the downloaded buffer and
        ``file_loader_kwargs`` and its documents are returned; otherwise the
        file is parsed as a PDF with PyPDF2, one Document per page.

        Args:
            id: The Drive file ID.

        Returns:
            The documents extracted from the file, each with ``source`` and
            ``title`` metadata.
        """
        from io import BytesIO

        from googleapiclient.discovery import build
        from googleapiclient.http import MediaIoBaseDownload

        creds = self._load_credentials()
        service = build("drive", "v3", credentials=creds)

        file = service.files().get(fileId=id, supportsAllDrives=True).execute()
        request = service.files().get_media(fileId=id)
        fh = BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while done is False:
            _, done = downloader.next_chunk()

        if self.file_loader_cls is not None:
            # Rewind so the custom loader reads the download from the start.
            fh.seek(0)
            loader = self.file_loader_cls(file=fh, **self.file_loader_kwargs)
            docs = loader.load()
            for doc in docs:
                doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view"
                if "title" not in doc.metadata:
                    doc.metadata["title"] = f"{file.get('name')}"
            return docs

        else:
            from PyPDF2 import PdfReader

            content = fh.getvalue()
            pdf_reader = PdfReader(BytesIO(content))

            return [
                Document(
                    page_content=page.extract_text(),
                    metadata={
                        "source": f"https://drive.google.com/file/d/{id}/view",
                        "title": f"{file.get('name')}",
                        "page": i,
                    },
                )
                for i, page in enumerate(pdf_reader.pages)
            ]

    def _load_file_from_ids(self) -> List[Document]:
        """Load files from a list of IDs.

        Raises:
            ValueError: If ``file_ids`` is not set.
        """
        if not self.file_ids:
            raise ValueError("file_ids must be set")
        docs = []
        for file_id in self.file_ids:
            docs.extend(self._load_file_from_id(file_id))
        return docs