! pip install --upgrade google-cloud-aiplatform
# LangChain
! pip install langchain langchain-experimental langchain[docarray]
! pip install pypdf
! pip install pydantic==1.10.8
# Open source vector store
! pip install chromadb==0.3.26
! pip install typing-inspect==0.8.0 typing_extensions==4.5.0
# For dense vector representations of text
! pip install sentence-transformers
7 Agents and vector stores
In this notebook, we will explore one of the most fun features of LangChain: agents and their toolkits.
Agents have access to tools such as JSON, Wikipedia, Web Search, GitHub, or Pandas DataFrames, and can invoke these tools' capabilities depending on the user input.
See the LangChain documentation for a full list of agent toolkits.
We will use the following technologies:
Vertex AI Generative Studio
LangChain, a framework for building applications with large language models
The open-source Chroma vector store database
7.1 Data Retrieval with LLMs and Embeddings
Matching customer queries to products via embeddings and Retrieval Augmented Generation.
7.1.1 Overview
This notebook demonstrates one method of using large language models to interact with data. Using the Wayfair WANDS dataset of more than 42,000 products, we will go through the following steps:
Download the data into a pandas dataframe and take a smaller 1,000-row sample set
Merge the product titles and descriptions, then generate embeddings for the combined text
Prompt an LLM to retrieve details and relevant documents related to queries.
# Automatically restart kernel after installs so that your environment can access the new packages
import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(True)
from google.colab import auth
auth.authenticate_user()
7.1.2 SDK and Project Initialization
# Fill in your GCP project_id and region
PROJECT_ID = "<..>"
REGION = "<..>"

import vertexai

vertexai.init(project=PROJECT_ID, location=REGION)
7.1.3 Import Langchain tools
# Utils
import time
from typing import List
# Langchain
import langchain
from pydantic import BaseModel
print(f"LangChain version: {langchain.__version__}")
# Vertex AI
from google.cloud import aiplatform
from langchain.chat_models import ChatVertexAI
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.schema import HumanMessage, SystemMessage
print(f"Vertex AI SDK version: {aiplatform.__version__}")
8 Import data
!wget -q https://raw.githubusercontent.com/wayfair/WANDS/main/dataset/product.csv
import pandas as pd
product_df = pd.read_csv("product.csv", sep='\t')
product_df = product_df[:1000].dropna()
len(product_df)
# Reduce the df to columns of interest
product_df = product_df.filter(["product_id", "product_name", "product_description", "average_rating"], axis=1)
product_df.head()
8.0.1 Import and initialize pandas dataframe agent
These tools use the langchain-experimental pip package installed at the start of the notebook.
8.0.2 Pandas agent
This agent allows us to interact with the dataframe using natural language. LangChain shows us the pandas queries it is composing to answer the questions.
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
agent = create_pandas_dataframe_agent(VertexAI(temperature=0), product_df, verbose=True)

agent.run("how many rows are there?")
agent.run("How many beds are there with a rating of > 4?")
8.0.3 CSV agent
We can also work directly with a .csv file.
"data.csv") pd.DataFrame.to_csv(product_df,
from langchain_experimental.agents.agent_toolkits import create_csv_agent
agent = create_csv_agent(
    VertexAI(temperature=0),
    "data.csv",
    verbose=True,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
)
"How many rows are there?") agent.run(
"Do any products descriptions mention polypropylene pile? Output them as JSON please") agent.run(
"What is the square root of all ratings for product names featuring sofas") agent.run(
8.1 Vector stores
We will explore embedding vectors and vector stores in more detail in the subsequent notebooks. Let's see what's possible by concatenating our product_name and product_description columns and creating a text file from the result. We can then create embeddings and perform various retrieval and Q&A tasks.
We will use the open source Chroma vector store.
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
product_df['text_data'] = product_df['product_name'] + " " + product_df['product_description']

# Save the "text_data" column to a text file
text_file_path = "combined_text_data.txt"
product_df['text_data'].to_csv(text_file_path, sep='\t', index=False, header=False)
# load the document and split it into chunks
loader = TextLoader("combined_text_data.txt")
documents = loader.load()
8.1.1 Text splitter
Splitting text is common when working with LangChain and LLMs in general. This practice means we can feed large amounts of data to LLMs for parsing or embedding in chunks, or batches.
Ideally, we want to do so in a way that keeps meaningful chunks together. We will use the recommended default, RecursiveCharacterTextSplitter. We specify a chunk_size and chunk_overlap to set an upper limit on the size of, and the overlap between, the splits/chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150
)
docs = text_splitter.split_documents(documents)
len(docs)
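As an optional sanity check (our own addition, not part of the original flow), we can confirm that no chunk exceeds the chunk_size upper limit:

# Longest chunk produced by the splitter; should be <= 1500 characters
max(len(d.page_content) for d in docs)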
from langchain.vectorstores import Chroma
# Clear any previous vector store
!rm -rf ./docs/chroma
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma.from_documents(docs, embedding_function)
= "Is there a slow cooker?"
query = db.similarity_search(query, n_results=2) docs
0] docs[
= "Recommend a durable door mat"
query = db.similarity_search(query, n_results=2) docs
docs
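To see how close each match is, we can use the wrapper's similarity_search_with_score method, which returns (document, distance) pairs; with Chroma's default distance metric, a lower score means a closer match:

# Retrieve the two nearest chunks together with their distance scores
for doc, score in db.similarity_search_with_score(query, k=2):
    print(round(score, 3), doc.page_content[:80])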
8.1.2 Retrieval
A Retriever is a method for answering questions based on the information in an index. Here, we use RetrievalQA, which combines this retrieval ability with a question-answering chain.
from langchain.chains import RetrievalQA
llm = VertexAI(
    model_name="text-bison@001",
    max_output_tokens=1024,
    temperature=0.1,
    top_p=0.8,
    top_k=40,
    verbose=True,
)
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever()
)
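We can try the chain as-is before customizing its prompt (the question below is our own example):

# Query the default RetrievalQA chain
qa_chain({"query": "Is there a slow cooker?"})["result"]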
8.1.3 Prompt
from langchain.prompts import PromptTemplate
# Build prompt
= """Use the following pieces of context to answer the question at the end. \
template If you don't know the answer, just say that you don't know, \
don't try to make up an answer. Use three sentences maximum. \
{context}
Question: {question}
Helpful Answer:"""
= PromptTemplate(input_variables=["context", "question"],template=template,) QA_CHAIN_PROMPT
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)
= "Can you recommend comfortable bed sheets?"
question = qa_chain({"query": question})
result "result"] result[