Last active
December 31, 2023 16:56
-
-
Save rjurney/528ee90c54beb8ac8c784be62c98764a to your computer and use it in GitHub Desktop.
Still trying to do RAG Q&A on all my academic papers... Chroma couldn't ingest 900 PDFs. I bet OpenSearch can...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
version: "3.8"

services:
  opensearch-node1: # This is also the hostname of the container within the Docker network (i.e. https://opensearch-node1/)
    image: opensearchproject/opensearch:latest # Specifying the latest available image - modify if you want a specific version
    container_name: opensearch-node1
    environment:
      - cluster.name=opensearch-cluster # Name the cluster
      - node.name=opensearch-node1 # Name the node that will run in this container
      - discovery.seed_hosts=opensearch-node1,opensearch-node2 # Nodes to look for when discovering the cluster
      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2 # Nodes eligible to serve as cluster manager
      - bootstrap.memory_lock=true # Disable JVM heap memory swapping
      - "OPENSEARCH_JAVA_OPTS=-Xms32g -Xmx32g" # Set min and max JVM heap sizes to at least 50% of system RAM
    ulimits:
      memlock:
        soft: -1 # Set memlock to unlimited (no soft or hard limit)
        hard: -1
      nofile:
        soft: 65536 # Maximum number of open files for the opensearch user - set to at least 65536
        hard: 65536
    volumes:
      - opensearch-data1:/usr/share/opensearch/data # Creates volume called opensearch-data1 and mounts it to the container
    ports:
      # Quoted: Compose recommends quoting HOST:CONTAINER mappings so YAML never
      # mis-types them (the sexagesimal trap for small port numbers)
      - "9200:9200" # REST API
      - "9600:9600" # Performance Analyzer
    networks:
      - chatbot-net # All of the containers will join the same Docker bridge network
  opensearch-node2:
    image: opensearchproject/opensearch:latest # This should be the same image used for opensearch-node1 to avoid issues
    container_name: opensearch-node2
    environment:
      - cluster.name=opensearch-cluster
      - node.name=opensearch-node2
      - discovery.seed_hosts=opensearch-node1,opensearch-node2
      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
      - bootstrap.memory_lock=true
      - "OPENSEARCH_JAVA_OPTS=-Xms32g -Xmx32g"
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    volumes:
      - opensearch-data2:/usr/share/opensearch/data
    networks:
      - chatbot-net
  opensearch-dashboards:
    image: opensearchproject/opensearch-dashboards:latest # Make sure the version of opensearch-dashboards matches the version of opensearch installed on other nodes
    container_name: opensearch-dashboards
    ports:
      - "5601:5601" # Map host port 5601 to container port 5601
    expose:
      - "5601" # Expose port 5601 for web access to OpenSearch Dashboards
    environment:
      OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]' # Define the OpenSearch nodes that OpenSearch Dashboards will query
    networks:
      - chatbot-net

volumes:
  opensearch-data1:
  opensearch-data2:

networks:
  chatbot-net:
    driver: bridge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Conda environment for the RAG Q&A project.
# Recreate with: conda env create -f environment.yml
name: rag
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2023.08.22=h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.4.4=h6a678d5_0
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.41.5=h5eee18b_0
  - ncurses=6.4=h6a678d5_0
  - openssl=3.0.11=h7f8727e_2
  - pip=23.2.1=py310h06a4308_0
  - python=3.10.13=h955ad1f_0
  - readline=8.2=h5eee18b_0
  - sqlite=3.41.2=h5eee18b_0
  - tk=8.6.12=h1ccaba5_0
  - wheel=0.38.4=py310h06a4308_0
  - xz=5.4.2=h5eee18b_0
  - zlib=1.2.13=h5eee18b_0
  # PyPI-only packages installed via pip into the conda env
  - pip:
      - aiohttp==3.8.5
      - aiosignal==1.3.1
      - annotated-types==0.6.0
      - anyio==3.7.1
      - appdirs==1.4.4
      - asttokens==2.4.0
      - async-timeout==4.0.3
      - attrs==23.1.0
      - aws-cdk-asset-awscli-v1==2.2.200
      - aws-cdk-asset-kubectl-v20==2.1.2
      - aws-cdk-asset-node-proxy-agent-v6==2.0.1
      - aws-cdk-lib==2.100.0
      - backcall==0.2.0
      - backoff==2.2.1
      - bcrypt==4.0.1
      - beautifulsoup4==4.12.2
      - black==23.9.1
      - cattrs==23.1.2
      - certifi==2023.7.22
      - cfgv==3.4.0
      - charset-normalizer==3.3.0
      - chatbot-class==0.1.0
      - chroma-hnswlib==0.7.3
      - chromadb==0.4.13
      - click==8.1.7
      - coloredlogs==15.0.1
      - constructs==10.2.70
      - dataclasses-json==0.6.1
      - decorator==5.1.1
      - distlib==0.3.7
      - docker-pycreds==0.4.0
      - exceptiongroup==1.1.3
      - executing==2.0.0
      - fastapi==0.103.2
      - filelock==3.12.4
      - flake8==6.1.0
      - flatbuffers==23.5.26
      - frozenlist==1.4.0
      - fsspec==2023.9.2
      - gitdb==4.0.10
      - gitpython==3.1.37
      - greenlet==3.0.0
      - h11==0.14.0
      - httptools==0.6.0
      - huggingface-hub==0.17.3
      - humanfriendly==10.0
      - identify==2.5.30
      - idna==3.4
      - importlib-resources==6.1.0
      - ipython==8.16.1
      - isort==5.12.0
      - jedi==0.19.1
      - jinja2==3.1.2
      - joblib==1.3.2
      - jsii==1.90.0
      - jsonpatch==1.33
      - jsonpointer==2.4
      - langchain==0.0.301
      - langchain-decorators==0.2.3
      - langchainhub==0.1.13
      - langsmith==0.0.43
      - llama-index==0.8.36
      - marko==2.0.0
      - markupsafe==2.1.3
      - marshmallow==3.20.1
      - matplotlib-inline==0.1.6
      - mccabe==0.7.0
      - monotonic==1.6
      - mpmath==1.3.0
      - multidict==6.0.4
      - mypy==1.5.1
      - mypy-extensions==1.0.0
      - nest-asyncio==1.5.8
      - networkx==3.1
      - nltk==3.8.1
      - nodeenv==1.8.0
      - numexpr==2.8.7
      - numpy==1.25.2
      - onnxruntime==1.16.0
      - openai==0.28.1
      - overrides==7.4.0
      - packaging==23.2
      - pandas==2.1.1
      - parso==0.8.3
      - pathspec==0.11.2
      - pathtools==0.1.2
      - pexpect==4.8.0
      - pickleshare==0.7.5
      - platformdirs==3.11.0
      - posthog==3.0.2
      - pre-commit==3.4.0
      - prompt-toolkit==3.0.39
      - promptwatch==0.3.0
      - protobuf==4.24.4
      - psutil==5.9.5
      - ptyprocess==0.7.0
      - publication==0.0.3
      - pulsar-client==3.3.0
      - pure-eval==0.2.2
      - pycodestyle==2.11.0
      - pydantic==2.4.2
      - pydantic-core==2.10.1
      - pyflakes==3.1.0
      - pygments==2.16.1
      - pypdf==3.16.2
      - pypika==0.48.9
      - python-dateutil==2.8.2
      - python-dotenv==1.0.0
      - pytz==2023.3.post1
      - pyyaml==6.0.1
      - regex==2023.10.3
      - requests==2.31.0
      - sentry-sdk==1.31.0
      - setproctitle==1.3.3
      - setuptools==68.2.2
      - six==1.16.0
      - smmap==5.0.1
      - sniffio==1.3.0
      - soupsieve==2.5
      - sqlalchemy==2.0.21
      - stack-data==0.6.3
      - starlette==0.27.0
      - sympy==1.12
      - tenacity==8.2.3
      - tiktoken==0.5.1
      - tokenizers==0.14.1
      - tomli==2.0.1
      - torch==2.1.0
      - tqdm==4.66.1
      - traitlets==5.11.2
      - typeguard==2.13.3
      - typer==0.9.0
      - types-requests==2.31.0.6
      - types-urllib3==1.26.25.14
      - typing-extensions==4.8.0
      - typing-inspect==0.9.0
      - tzdata==2023.3
      - urllib3==1.26.17
      - uvicorn==0.23.2
      - uvloop==0.17.0
      - virtualenv==20.24.5
      - wandb==0.15.12
      - watchfiles==0.20.0
      - wcwidth==0.2.8
      - websockets==11.0.3
      - yarl==1.9.2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Load academic-paper PDFs, embed them with OpenAI, index them into
OpenSearch, and run a conversational retrieval Q&A chain over them."""
import logging
import os

from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import CacheBackedEmbeddings, OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.storage import LocalFileStore
from langchain.vectorstores import OpenSearchVectorSearch

logging.getLogger("langchain").setLevel(logging.DEBUG)

# Dropbox folder with academic papers
PAPER_FOLDER = "/home/rjurney/Dropbox/Academic Papers/"
assert os.path.exists(PAPER_FOLDER)

# Set in my ~/.zshrc
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")

# Load all PDFs from the academic paper folder; silent_errors skips
# unparseable PDFs instead of aborting the whole load
loader = PyPDFDirectoryLoader(PAPER_FOLDER, silent_errors=True)
docs = loader.load()

# How many papers on network motifs? Each PDF is split into page-level
# segments, so count distinct source files separately from segments.
motif_docs = [doc for doc in docs if "motif" in doc.page_content]
motif_doc_count = len(motif_docs)
paper_count = len(set(doc.metadata["source"] for doc in motif_docs))
print(
    f"You have {paper_count} papers on network motifs split across {motif_doc_count} document segments in `{PAPER_FOLDER}`."
)

# Embed with the OpenAI ada model; cache embeddings on local disk so re-runs
# don't re-pay the OpenAI API for documents already embedded
embeddings = OpenAIEmbeddings()
fs = LocalFileStore("./data/embedding_cache/")
cached_embedder = CacheBackedEmbeddings.from_bytes_store(embeddings, fs, namespace=embeddings.model)

# Set up OpenSearch to store the embeddings
# NOTE(review): the official opensearch Docker image ships with the security
# plugin enabled (HTTPS + basic auth) by default - this plain-http URL assumes
# security was disabled; otherwise pass use_ssl/http_auth kwargs. TODO confirm.
opensearch = OpenSearchVectorSearch(
    index_name="academic_papers",
    embedding_function=cached_embedder,
    opensearch_url="http://localhost:9200",
)

# BUG FIX: the original script never indexed anything, so the retriever below
# searched an empty index. Embed and ingest every loaded document segment.
opensearch.add_documents(docs)

# Setup a simple buffer memory system to submit with the API calls to provide prompt context
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Create a ConversationalRetrievalChain from the LLM, the vectorstore, and the memory system
qa = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=0.8),
    opensearch.as_retriever(),
    memory=memory,
    verbose=True,
)

result = qa({"question": "What are the different types of network motif?"})
print(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment