Skip to content

Instantly share code, notes, and snippets.

@rjurney
Last active December 31, 2023 16:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rjurney/528ee90c54beb8ac8c784be62c98764a to your computer and use it in GitHub Desktop.
Still trying to do RAG Q&A on all my academic papers... Chroma couldn't ingest 900 PDFs. I bet OpenSearch can...
version: "3.8"

services:
  # This is also the hostname of the container within the Docker network
  # (i.e. https://opensearch-node1/)
  opensearch-node1:
    image: opensearchproject/opensearch:latest  # Pin a specific version if you need reproducibility
    container_name: opensearch-node1
    environment:
      - cluster.name=opensearch-cluster  # Name the cluster
      - node.name=opensearch-node1  # Name the node that will run in this container
      - discovery.seed_hosts=opensearch-node1,opensearch-node2  # Nodes to look for when discovering the cluster
      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2  # Nodes eligible to serve as cluster manager
      - bootstrap.memory_lock=true  # Disable JVM heap memory swapping
      - "OPENSEARCH_JAVA_OPTS=-Xms32g -Xmx32g"  # Min/max JVM heap; OpenSearch recommends at most 50% of system RAM
    ulimits:
      memlock:
        soft: -1  # Set memlock to unlimited (no soft or hard limit)
        hard: -1
      nofile:
        soft: 65536  # Maximum number of open files for the opensearch user - set to at least 65536
        hard: 65536
    volumes:
      - opensearch-data1:/usr/share/opensearch/data  # Creates volume called opensearch-data1 and mounts it to the container
    ports:
      - "9200:9200"  # REST API (quoted to avoid YAML sexagesimal parsing of port mappings)
      - "9600:9600"  # Performance Analyzer
    networks:
      - chatbot-net  # All of the containers will join the same Docker bridge network

  opensearch-node2:
    image: opensearchproject/opensearch:latest  # Same image as opensearch-node1 to avoid version-mismatch issues
    container_name: opensearch-node2
    environment:
      - cluster.name=opensearch-cluster
      - node.name=opensearch-node2
      - discovery.seed_hosts=opensearch-node1,opensearch-node2
      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
      - bootstrap.memory_lock=true
      - "OPENSEARCH_JAVA_OPTS=-Xms32g -Xmx32g"
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    volumes:
      - opensearch-data2:/usr/share/opensearch/data
    networks:
      - chatbot-net

  opensearch-dashboards:
    # The opensearch-dashboards version must match the opensearch version on the nodes
    image: opensearchproject/opensearch-dashboards:latest
    container_name: opensearch-dashboards
    ports:
      - "5601:5601"  # Map host port 5601 to container port 5601
    expose:
      - "5601"  # Expose port 5601 for web access to OpenSearch Dashboards
    environment:
      # Define the OpenSearch nodes that OpenSearch Dashboards will query
      OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]'
    networks:
      - chatbot-net

volumes:
  opensearch-data1:
  opensearch-data2:

networks:
  chatbot-net:
    driver: bridge
# Conda environment for the RAG chatbot; conda-managed packages first, then pip-only packages.
name: rag
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2023.08.22=h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.4.4=h6a678d5_0
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.41.5=h5eee18b_0
  - ncurses=6.4=h6a678d5_0
  - openssl=3.0.11=h7f8727e_2
  - pip=23.2.1=py310h06a4308_0
  - python=3.10.13=h955ad1f_0
  - readline=8.2=h5eee18b_0
  - sqlite=3.41.2=h5eee18b_0
  - tk=8.6.12=h1ccaba5_0
  - wheel=0.38.4=py310h06a4308_0
  - xz=5.4.2=h5eee18b_0
  - zlib=1.2.13=h5eee18b_0
  # Packages below are installed via pip inside the conda environment
  - pip:
      - aiohttp==3.8.5
      - aiosignal==1.3.1
      - annotated-types==0.6.0
      - anyio==3.7.1
      - appdirs==1.4.4
      - asttokens==2.4.0
      - async-timeout==4.0.3
      - attrs==23.1.0
      - aws-cdk-asset-awscli-v1==2.2.200
      - aws-cdk-asset-kubectl-v20==2.1.2
      - aws-cdk-asset-node-proxy-agent-v6==2.0.1
      - aws-cdk-lib==2.100.0
      - backcall==0.2.0
      - backoff==2.2.1
      - bcrypt==4.0.1
      - beautifulsoup4==4.12.2
      - black==23.9.1
      - cattrs==23.1.2
      - certifi==2023.7.22
      - cfgv==3.4.0
      - charset-normalizer==3.3.0
      - chatbot-class==0.1.0
      - chroma-hnswlib==0.7.3
      - chromadb==0.4.13
      - click==8.1.7
      - coloredlogs==15.0.1
      - constructs==10.2.70
      - dataclasses-json==0.6.1
      - decorator==5.1.1
      - distlib==0.3.7
      - docker-pycreds==0.4.0
      - exceptiongroup==1.1.3
      - executing==2.0.0
      - fastapi==0.103.2
      - filelock==3.12.4
      - flake8==6.1.0
      - flatbuffers==23.5.26
      - frozenlist==1.4.0
      - fsspec==2023.9.2
      - gitdb==4.0.10
      - gitpython==3.1.37
      - greenlet==3.0.0
      - h11==0.14.0
      - httptools==0.6.0
      - huggingface-hub==0.17.3
      - humanfriendly==10.0
      - identify==2.5.30
      - idna==3.4
      - importlib-resources==6.1.0
      - ipython==8.16.1
      - isort==5.12.0
      - jedi==0.19.1
      - jinja2==3.1.2
      - joblib==1.3.2
      - jsii==1.90.0
      - jsonpatch==1.33
      - jsonpointer==2.4
      - langchain==0.0.301
      - langchain-decorators==0.2.3
      - langchainhub==0.1.13
      - langsmith==0.0.43
      - llama-index==0.8.36
      - marko==2.0.0
      - markupsafe==2.1.3
      - marshmallow==3.20.1
      - matplotlib-inline==0.1.6
      - mccabe==0.7.0
      - monotonic==1.6
      - mpmath==1.3.0
      - multidict==6.0.4
      - mypy==1.5.1
      - mypy-extensions==1.0.0
      - nest-asyncio==1.5.8
      - networkx==3.1
      - nltk==3.8.1
      - nodeenv==1.8.0
      - numexpr==2.8.7
      - numpy==1.25.2
      - onnxruntime==1.16.0
      - openai==0.28.1
      - overrides==7.4.0
      - packaging==23.2
      - pandas==2.1.1
      - parso==0.8.3
      - pathspec==0.11.2
      - pathtools==0.1.2
      - pexpect==4.8.0
      - pickleshare==0.7.5
      - platformdirs==3.11.0
      - posthog==3.0.2
      - pre-commit==3.4.0
      - prompt-toolkit==3.0.39
      - promptwatch==0.3.0
      - protobuf==4.24.4
      - psutil==5.9.5
      - ptyprocess==0.7.0
      - publication==0.0.3
      - pulsar-client==3.3.0
      - pure-eval==0.2.2
      - pycodestyle==2.11.0
      - pydantic==2.4.2
      - pydantic-core==2.10.1
      - pyflakes==3.1.0
      - pygments==2.16.1
      - pypdf==3.16.2
      - pypika==0.48.9
      - python-dateutil==2.8.2
      - python-dotenv==1.0.0
      - pytz==2023.3.post1
      - pyyaml==6.0.1
      - regex==2023.10.3
      - requests==2.31.0
      - sentry-sdk==1.31.0
      - setproctitle==1.3.3
      - setuptools==68.2.2
      - six==1.16.0
      - smmap==5.0.1
      - sniffio==1.3.0
      - soupsieve==2.5
      - sqlalchemy==2.0.21
      - stack-data==0.6.3
      - starlette==0.27.0
      - sympy==1.12
      - tenacity==8.2.3
      - tiktoken==0.5.1
      - tokenizers==0.14.1
      - tomli==2.0.1
      - torch==2.1.0
      - tqdm==4.66.1
      - traitlets==5.11.2
      - typeguard==2.13.3
      - typer==0.9.0
      - types-requests==2.31.0.6
      - types-urllib3==1.26.25.14
      - typing-extensions==4.8.0
      - typing-inspect==0.9.0
      - tzdata==2023.3
      - urllib3==1.26.17
      - uvicorn==0.23.2
      - uvloop==0.17.0
      - virtualenv==20.24.5
      - wandb==0.15.12
      - watchfiles==0.20.0
      - wcwidth==0.2.8
      - websockets==11.0.3
      - yarl==1.9.2
"""RAG Q&A over a folder of academic-paper PDFs using OpenAI embeddings and OpenSearch.

Loads every PDF in PAPER_FOLDER, reports how many touch on network motifs,
embeds the pages with OpenAI's embedding model (cached on local disk), stores
them in OpenSearch, and runs one conversational retrieval query.
"""

import logging
import os

from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import CacheBackedEmbeddings, OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.storage import LocalFileStore
from langchain.vectorstores import OpenSearchVectorSearch

logging.getLogger("langchain").setLevel(logging.DEBUG)

# Dropbox folder with academic papers
PAPER_FOLDER = "/home/rjurney/Dropbox/Academic Papers/"
assert os.path.exists(PAPER_FOLDER), f"Paper folder not found: {PAPER_FOLDER}"

# Set in my ~/.zshrc; fail fast if missing rather than erroring deep inside the API client
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")

# Load all PDFs from the academic paper folder; silent_errors skips unparseable PDFs
loader = PyPDFDirectoryLoader(PAPER_FOLDER, silent_errors=True)
docs = loader.load()

# How many papers mention network motifs? Each loaded doc is one page/segment,
# so de-duplicate on the "source" metadata to count distinct papers.
motif_docs = [doc for doc in docs if "motif" in doc.page_content]
motif_doc_count = len(motif_docs)
paper_count = len({doc.metadata["source"] for doc in motif_docs})
print(
    f"You have {paper_count} papers on network motifs split across "
    f"{motif_doc_count} document segments in `{PAPER_FOLDER}`."
)

# Embed with the OpenAI embedding model; cache embeddings on disk so reruns are cheap
embeddings = OpenAIEmbeddings()
fs = LocalFileStore("./data/embedding_cache/")
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embeddings, fs, namespace=embeddings.model
)

# OpenSearch vector store to hold the embeddings
opensearch = OpenSearchVectorSearch(
    index_name="academic_papers",
    embedding_function=cached_embedder,
    opensearch_url="http://localhost:9200",
)

# Simple buffer memory to provide prior chat turns as prompt context
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# ConversationalRetrievalChain wires together the LLM, the vector store retriever,
# and the memory system
qa = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=0.8),
    opensearch.as_retriever(),
    memory=memory,
    verbose=True,
)

result = qa({"question": "What are the different types of network motif?"})
print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment