import pandas as pd
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

# Load the FAQ CSV (four columns, no header row)
df = pd.read_csv(
    "/content/gdrive/MyDrive/Colab Notebooks/recochoku/recochoku_faq.csv",
    usecols=[0, 1, 2, 3],
    header=None,
)

# Remove newlines and carriage returns
df = df.replace("\n", "", regex=True)
df = df.replace("\r", "", regex=True)
# Remove full-width (ideographic) spaces
df = df.replace("\u3000", "", regex=True)

# Build one Document per FAQ entry; the "source" metadata records which
# FAQ an answer was generated from, so it can be traced back later
questions = range(0, 147)
sources = []
for q in questions:
    sources.append(
        Document(
            page_content=df.iat[q, 1],
            metadata={"source": f"Q{q+1}: {df.iat[q, 2]} > {df.iat[q, 3]} > {df.iat[q, 0]}"},
        )
    )

# Split each document into chunks, carrying the source metadata through
source_chunks = []
splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
for source in sources:
    for chunk in splitter.split_text(source.page_content):
        source_chunks.append(Document(page_content=chunk, metadata=source.metadata))

# Embed the chunks, build a FAISS index, and persist it to Drive
search_index = FAISS.from_documents(source_chunks, OpenAIEmbeddings())
search_index.save_local("/content/gdrive/MyDrive/Colab Notebooks/recochoku/index")
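
# As a quick sanity check, the persisted index can be loaded back and queried.
# A minimal sketch: the query string and k=3 are illustrative (the query is in
# Japanese because the FAQ corpus is), and it assumes the same OpenAI API key
# is configured as when the index was built.
loaded_index = FAISS.load_local(
    "/content/gdrive/MyDrive/Colab Notebooks/recochoku/index",
    OpenAIEmbeddings(),
)
# Retrieve the 3 FAQ chunks most similar to the question and show which
# FAQ entry each one came from
for doc in loaded_index.similarity_search("解約方法を教えてください", k=3):
    print(doc.metadata["source"])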