LangChat documentation — Indexing documents

Index a single PDF

from langchat import LangChat
from langchat.providers import OpenAI, Pinecone, Supabase

# Pull provider credentials from the environment before wiring anything up.
LangChat.load_env()

# Name each provider up front instead of inlining them in the constructor.
llm = OpenAI("gpt-4o-mini")
vector_store = Pinecone("my-index")
database = Supabase()

lc = LangChat(llm=llm, vector_db=vector_store, db=database)

result = lc.index("docs/user-manual.pdf")
print(f"Indexed {result['chunks_indexed']} chunks")

Index multiple files

# Index a batch of mixed-format files in a single call.
sources = [
    "content/faq.pdf",
    "content/pricing.md",
    "content/terms.txt",
    "content/products.csv",
]
result = lc.index(sources)

print(f"Files processed: {result['files_processed']}")
print(f"Chunks indexed:  {result['chunks_indexed']}")
print(f"Chunks skipped:  {result['chunks_skipped']}")  # duplicates

errors = result.get("errors")
if errors:
    for err in errors:
        print(f"  Error: {err}")

Index a directory

# Index everything in the docs/ folder
# (passing a directory path indexes the files it contains; whether
# subdirectories are also walked is not shown here — confirm against
# the lc.index reference)
result = lc.index("docs/")

Custom chunking

For long-form content (legal documents, books), larger chunks preserve more context:
# Larger chunks keep more surrounding context inside each embedding.
result = lc.index("docs/legal-contracts.pdf", chunk_size=2000, chunk_overlap=400)
For short Q&A content (FAQs, product specs), smaller chunks improve precision:
# Smaller chunks make retrieval more precise for short Q&A entries.
result = lc.index("content/faq.md", chunk_size=500, chunk_overlap=50)

Using namespaces

Namespaces let a single Pinecone index serve multiple document collections:
# Index different content into separate namespaces
collections = [
    ("products/", "products"),
    ("support/", "support"),
    ("policies/", "policies"),
]
for folder, ns in collections:
    lc.index(folder, namespace=ns)

print("All namespaces indexed")

Build a knowledge base script

A complete script to build a fresh knowledge base:
#!/usr/bin/env python
# build_knowledge_base.py
#
# Build a fresh knowledge base from everything under DOCS_PATH.
# Exits non-zero when any file failed to index, so the script can be
# wired into CI or a cron job and fail loudly.

import sys
from langchat import LangChat
from langchat.providers import OpenAI, Pinecone, Supabase

# Load provider credentials (API keys, URLs) from the environment.
LangChat.load_env()

DOCS_PATH = "content/"
NAMESPACE = "main"

lc = LangChat(
    llm=OpenAI("gpt-4o-mini"),
    vector_db=Pinecone("my-index"),
    db=Supabase(),
)

print(f"Indexing {DOCS_PATH}...")

result = lc.index(
    DOCS_PATH,
    chunk_size=1000,
    chunk_overlap=200,
    namespace=NAMESPACE,
    prevent_duplicates=True,  # skip chunks already present in the index
)

# Plain literal: this line had no placeholders, so no f-prefix needed.
print("✓ Done")
print(f"  Chunks indexed:   {result['chunks_indexed']}")
print(f"  Chunks skipped:   {result['chunks_skipped']}")
print(f"  Files processed:  {result['files_processed']}")

if result.get("errors"):
    print(f"\n  Errors ({len(result['errors'])}):")
    for err in result["errors"]:
        print(f"    {err}")
    sys.exit(1)
Run with:
python build_knowledge_base.py

Watch for changes (polling)

For development, re-index when files change:
import time
import hashlib
from pathlib import Path

def file_hash(path: str) -> str:
    """Return the hex MD5 digest of the file at *path*.

    Hashes the file incrementally in 64 KiB chunks so large documents
    are fingerprinted without loading the whole file into memory
    (the previous version read everything at once via read_bytes()).
    The digest is identical either way.

    MD5 is used here only as a cheap change-detection fingerprint,
    not for anything security-sensitive.
    """
    digest = hashlib.md5()
    with Path(path).open("rb") as fh:
        while chunk := fh.read(65536):
            digest.update(chunk)
    return digest.hexdigest()

# Remember the last-seen digest of every file under the watched root.
seen = {}
watch_root = "content/"

while True:
    modified = []
    for entry in Path(watch_root).rglob("*"):
        if not entry.is_file():
            continue  # skip directories and other non-file entries
        key = str(entry)
        digest = file_hash(key)
        if seen.get(key) != digest:
            # New or changed since the last poll — record and queue it.
            seen[key] = digest
            modified.append(key)

    if modified:
        print(f"Detected changes in {len(modified)} file(s), re-indexing...")
        result = lc.index(modified, prevent_duplicates=False)
        print(f"  Re-indexed {result['chunks_indexed']} chunks")

    time.sleep(30)  # check every 30 seconds