Documentation Index
Fetch the complete documentation index at: https://langchat.neurobrains.co/llms.txt
Use this file to discover all available pages before exploring further.
Index a single PDF
from langchat import LangChat
from langchat.providers import OpenAI, Pinecone, Supabase
# load_env() runs first, before any provider object is created.
LangChat.load_env()

# Build the three providers separately, then hand them to LangChat.
llm = OpenAI("gpt-4o-mini")
vector_db = Pinecone("my-index")
db = Supabase()
lc = LangChat(llm=llm, vector_db=vector_db, db=db)

# index() takes the path of one file; the result is read like a dict and
# its 'chunks_indexed' entry is printed.
result = lc.index("docs/user-manual.pdf")
print(f"Indexed {result['chunks_indexed']} chunks")
Index multiple files
# Passing a list of paths indexes several files in a single call.
result = lc.index(
    [
        "content/faq.pdf",
        "content/pricing.md",
        "content/terms.txt",
        "content/products.csv",
    ]
)

print(f"Files processed: {result['files_processed']}")
print(f"Chunks indexed: {result['chunks_indexed']}")
print(f"Chunks skipped: {result['chunks_skipped']}")  # duplicates

# "errors" is read with .get(), so a missing or empty entry means there is
# nothing to report; otherwise every entry is printed on its own line.
if result.get("errors"):
    for err in result["errors"]:
        print(f" Error: {err}")
Index a directory
# Index everything in the docs/ folder: a directory path is passed to
# index() the same way a single file path is.
docs_dir = "docs/"
result = lc.index(docs_dir)
Custom chunking
For long-form content (legal documents, books), larger chunks preserve more context:
# Chunking settings for long-form documents, kept together so they can be
# reused across calls.
long_form_chunking = {"chunk_size": 2000, "chunk_overlap": 400}
result = lc.index("docs/legal-contracts.pdf", **long_form_chunking)
For short Q&A content (FAQs, product specs), smaller chunks improve precision:
# Chunking settings for short Q&A content, kept together so they can be
# reused across calls.
faq_chunking = {"chunk_size": 500, "chunk_overlap": 50}
result = lc.index("content/faq.md", **faq_chunking)
Using namespaces
Namespaces let a single Pinecone index serve multiple document collections:
# Index different content into separate namespaces.
# Each entry pairs a source folder with the namespace it is indexed into.
collections = (
    ("products/", "products"),
    ("support/", "support"),
    ("policies/", "policies"),
)
for folder, namespace in collections:
    lc.index(folder, namespace=namespace)

print("All namespaces indexed")
Build a knowledge base script
A complete script to build a fresh knowledge base:
#!/usr/bin/env python
# build_knowledge_base.py
import sys
from pathlib import Path
from langchat import LangChat
from langchat.providers import OpenAI, Pinecone, Supabase
LangChat.load_env()

# Settings for this build: the folder to index and the target namespace.
DOCS_PATH = "content/"
NAMESPACE = "main"

lc = LangChat(
    llm=OpenAI("gpt-4o-mini"),
    vector_db=Pinecone("my-index"),
    db=Supabase(),
)

print(f"Indexing {DOCS_PATH}...")
result = lc.index(
    DOCS_PATH,
    chunk_size=1000,
    chunk_overlap=200,
    namespace=NAMESPACE,
    prevent_duplicates=True,
)

# Summary of the run, read from the result returned by index().
print("✓ Done")
print(f" Chunks indexed: {result['chunks_indexed']}")
print(f" Chunks skipped: {result['chunks_skipped']}")
print(f" Files processed: {result['files_processed']}")

# Only when errors were reported: list them and exit with a non-zero
# status so a calling shell or CI job can detect the failed build.
if result.get("errors"):
    print(f"\n Errors ({len(result['errors'])}):")
    for err in result["errors"]:
        print(f" {err}")
    sys.exit(1)
Run with:
python build_knowledge_base.py
Watch for changes (polling)
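For development, poll the content directory and re-index files whose contents change. This snippet reuses the `lc` instance created in the earlier examples: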
import time
import hashlib
from pathlib import Path
def file_hash(path: str) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*.

    The digest is only used to notice that a file's content has changed,
    not for anything security-sensitive. The file is read in fixed-size
    chunks so a large document is never held in memory all at once.

    Args:
        path: Location of the file to hash.

    Returns:
        The 32-character lowercase hex digest of the file's bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        # iter() with a sentinel stops at the empty bytes object that
        # read() returns once the end of the file is reached.
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()
# Track file hashes: last-seen content hash for every file, keyed by its
# path string. It starts empty, so the first pass treats every file as
# changed and indexes all of them.
hashes = {}
docs_path = "content/"

while True:
    changed = []
    for f in Path(docs_path).rglob("*"):
        if not f.is_file():
            continue
        try:
            h = file_hash(str(f))
        except OSError:
            # The file vanished or became unreadable between the directory
            # scan and the read; skip it and look again on the next pass.
            continue
        if hashes.get(str(f)) != h:
            hashes[str(f)] = h
            changed.append(str(f))

    if changed:
        print(f"Detected changes in {len(changed)} file(s), re-indexing...")
        # NOTE(review): prevent_duplicates=False presumably lets edited
        # files be indexed again instead of being skipped; confirm.
        result = lc.index(changed, prevent_duplicates=False)
        print(f" Re-indexed {result['chunks_indexed']} chunks")

    time.sleep(30)  # check every 30 seconds