Utilizing Generative AI for Question Answering in Python Through the LangChain Framework
Step 1: Install Required Libraries
langchain
pandas
numpy
tiktoken
psycopg2
pgvector
python-dotenv
sentence_transformers
openai==0.28.1
psycopg2-binary
pypdf
unstructured
unstructured[local-inference]
Step 2: Import Required Libraries
import pandas as pd
import numpy as np
from langchain.vectorstores.pgvector import PGVector
import tiktoken
from langchain.text_splitter import TokenTextSplitter
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.pgvector import DistanceStrategy
from langchain.schema import Document
Step 3: Vector DB - pgvector details
# SQLAlchemy-style database URL for the pgvector-enabled PostgreSQL instance:
#   postgresql+psycopg2://<user>:<password>@<server>:<port>/<database>
# Replace the <...> placeholders with real values; "pgvector" is the database name.
CONNECTION_STRING = "postgresql+psycopg2://<user>:<password>@<server>:<port>/pgvector"
Step 4: LLM API Key
import os

# Read the key from the OPENAI_API_KEY environment variable so the secret does
# not have to be committed with the source; the placeholder is only a fallback.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "xxxxxxxxxxxxxxxxxxx")
Step 5: Extract data from CSV
# Load the question/answer dataset. The later steps read its "Title",
# "Question" and "Answer" columns. The file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv("QA_set.csv", encoding="ISO-8859-1")
# Preview the first rows (rendered when run as the last expression of a notebook cell).
df.head()
Step 6: Write a function to calculate the number of tokens
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens *string* encodes to under a tiktoken encoding.

    Args:
        string: Text to tokenize. Falsy values (``""`` or ``None``) count as
            zero tokens and are never passed to the tokenizer.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        The length of the encoded token sequence, or 0 for falsy input.
    """
    if not string:
        return 0
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))
Step 7: Split the text into chunks of 512 tokens
# Maximum number of tokens allowed in a single chunk of question text.
CHUNK_SIZE = 512

text_splitter = TokenTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=103)

# Rows of the re-chunked table: [Title, Question (or one chunk of it), Answer].
new_list = []

# Questions within the token limit are kept whole; longer ones are split and
# every chunk is stored as its own row with the original Title and Answer.
# Iterating the column values directly avoids label-based `df[col][i]` lookups,
# which only work when the DataFrame has the default 0..n-1 index.
for title, question, answer in zip(df["Title"], df["Question"], df["Answer"]):
    if num_tokens_from_string(question) <= CHUNK_SIZE:
        new_list.append([title, question, answer])
    else:
        new_list.extend(
            [title, chunk, answer] for chunk in text_splitter.split_text(question)
        )

df_new = pd.DataFrame(new_list, columns=["Title", "Question", "Answer"])
# Preview the first rows (rendered when run as the last expression of a notebook cell).
df_new.head()
Step 8: Embeddings
# Turn each row of df_new into a document whose page content is the "Question"
# column; the Title and Answer values are read back from doc.metadata later on.
loader = DataFrameLoader(df_new, page_content_column="Question")
docs = loader.load()

# Sentence-transformers model used to embed the documents and the query.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
Step 9: Create a PGVector instance to store the documents and embeddings
# Embed the documents and store them in the "QA_set" collection of the
# database addressed by CONNECTION_STRING, using cosine distance for search.
db = PGVector.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name="QA_set",
    distance_strategy=DistanceStrategy.COSINE,
    connection_string=CONNECTION_STRING,
)
Step 10: Run a semantic search for the user query
query = "How do elephants communicate over long distances?"

# Fetch the k=3 most similar documents from the vector store.
docs = db.similarity_search(query, k=3)
Step 11: User Query Response
# Inspect the first document returned by the similarity search on pgvector.
doc = docs[0]

# The document's text content (the matched question or question chunk).
doc_content = doc.page_content

# The document's metadata, which holds the "Title" and "Answer" values.
doc_metadata = doc.metadata

# f-strings format any value, so a non-string metadata entry does not raise
# the TypeError that "str" + value concatenation would.
print(f"Content snippet:{doc_content[:500]}")
print(f"Document title: {doc_metadata['Title']}")
print(f"Document Answer: {doc_metadata['Answer']}")