Skip to content

Instantly share code, notes, and snippets.

@rjurney
Last active December 31, 2023 16:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rjurney/528ee90c54beb8ac8c784be62c98764a to your computer and use it in GitHub Desktop.
Still trying to do RAG Q&A on all my academic papers... Chroma couldn't ingest 900 PDFs. I bet OpenSearch can...
version: "3.8"

services:
  # This is also the hostname of the container within the Docker network
  # (i.e. https://opensearch-node1/)
  opensearch-node1:
    image: opensearchproject/opensearch:latest  # Pin a specific version if you need reproducibility
    container_name: opensearch-node1
    environment:
      - cluster.name=opensearch-cluster  # Name the cluster
      - node.name=opensearch-node1  # Name the node that will run in this container
      - discovery.seed_hosts=opensearch-node1,opensearch-node2  # Nodes to look for when discovering the cluster
      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2  # Nodes eligible to serve as cluster manager
      - bootstrap.memory_lock=true  # Disable JVM heap memory swapping
      - "OPENSEARCH_JAVA_OPTS=-Xms32g -Xmx32g"  # Min/max JVM heap; OpenSearch recommends at most 50% of system RAM
    ulimits:
      memlock:
        soft: -1  # Set memlock to unlimited (no soft or hard limit)
        hard: -1
      nofile:
        soft: 65536  # Maximum number of open files for the opensearch user - set to at least 65536
        hard: 65536
    volumes:
      - opensearch-data1:/usr/share/opensearch/data  # Creates volume called opensearch-data1 and mounts it to the container
    ports:
      - "9200:9200"  # REST API (quoted to avoid YAML sexagesimal parsing of port mappings)
      - "9600:9600"  # Performance Analyzer
    networks:
      - chatbot-net  # All of the containers will join the same Docker bridge network

  opensearch-node2:
    image: opensearchproject/opensearch:latest  # Same image as opensearch-node1 to avoid version-mismatch issues
    container_name: opensearch-node2
    environment:
      - cluster.name=opensearch-cluster
      - node.name=opensearch-node2
      - discovery.seed_hosts=opensearch-node1,opensearch-node2
      - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2
      - bootstrap.memory_lock=true
      - "OPENSEARCH_JAVA_OPTS=-Xms32g -Xmx32g"
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    volumes:
      - opensearch-data2:/usr/share/opensearch/data
    networks:
      - chatbot-net

  opensearch-dashboards:
    # The opensearch-dashboards version must match the opensearch version on the nodes
    image: opensearchproject/opensearch-dashboards:latest
    container_name: opensearch-dashboards
    ports:
      - "5601:5601"  # Map host port 5601 to container port 5601
    expose:
      - "5601"  # Expose port 5601 for web access to OpenSearch Dashboards
    environment:
      # Define the OpenSearch nodes that OpenSearch Dashboards will query
      OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]'
    networks:
      - chatbot-net

volumes:
  opensearch-data1:
  opensearch-data2:

networks:
  chatbot-net:
    driver: bridge
# Conda environment for the RAG chatbot; conda-managed packages first, then pip-only packages.
name: rag
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2023.08.22=h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.4.4=h6a678d5_0
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.41.5=h5eee18b_0
  - ncurses=6.4=h6a678d5_0
  - openssl=3.0.11=h7f8727e_2
  - pip=23.2.1=py310h06a4308_0
  - python=3.10.13=h955ad1f_0
  - readline=8.2=h5eee18b_0
  - sqlite=3.41.2=h5eee18b_0
  - tk=8.6.12=h1ccaba5_0
  - wheel=0.38.4=py310h06a4308_0
  - xz=5.4.2=h5eee18b_0
  - zlib=1.2.13=h5eee18b_0
  # Packages below are installed via pip inside the conda environment
  - pip:
      - aiohttp==3.8.5
      - aiosignal==1.3.1
      - annotated-types==0.6.0
      - anyio==3.7.1
      - appdirs==1.4.4
      - asttokens==2.4.0
      - async-timeout==4.0.3
      - attrs==23.1.0
      - aws-cdk-asset-awscli-v1==2.2.200
      - aws-cdk-asset-kubectl-v20==2.1.2
      - aws-cdk-asset-node-proxy-agent-v6==2.0.1
      - aws-cdk-lib==2.100.0
      - backcall==0.2.0
      - backoff==2.2.1
      - bcrypt==4.0.1
      - beautifulsoup4==4.12.2
      - black==23.9.1
      - cattrs==23.1.2
      - certifi==2023.7.22
      - cfgv==3.4.0
      - charset-normalizer==3.3.0
      - chatbot-class==0.1.0
      - chroma-hnswlib==0.7.3
      - chromadb==0.4.13
      - click==8.1.7
      - coloredlogs==15.0.1
      - constructs==10.2.70
      - dataclasses-json==0.6.1
      - decorator==5.1.1
      - distlib==0.3.7
      - docker-pycreds==0.4.0
      - exceptiongroup==1.1.3
      - executing==2.0.0
      - fastapi==0.103.2
      - filelock==3.12.4
      - flake8==6.1.0
      - flatbuffers==23.5.26
      - frozenlist==1.4.0
      - fsspec==2023.9.2
      - gitdb==4.0.10
      - gitpython==3.1.37
      - greenlet==3.0.0
      - h11==0.14.0
      - httptools==0.6.0
      - huggingface-hub==0.17.3
      - humanfriendly==10.0
      - identify==2.5.30
      - idna==3.4
      - importlib-resources==6.1.0
      - ipython==8.16.1
      - isort==5.12.0
      - jedi==0.19.1
      - jinja2==3.1.2
      - joblib==1.3.2
      - jsii==1.90.0
      - jsonpatch==1.33
      - jsonpointer==2.4
      - langchain==0.0.301
      - langchain-decorators==0.2.3
      - langchainhub==0.1.13
      - langsmith==0.0.43
      - llama-index==0.8.36
      - marko==2.0.0
      - markupsafe==2.1.3
      - marshmallow==3.20.1
      - matplotlib-inline==0.1.6
      - mccabe==0.7.0
      - monotonic==1.6
      - mpmath==1.3.0
      - multidict==6.0.4
      - mypy==1.5.1
      - mypy-extensions==1.0.0
      - nest-asyncio==1.5.8
      - networkx==3.1
      - nltk==3.8.1
      - nodeenv==1.8.0
      - numexpr==2.8.7
      - numpy==1.25.2
      - onnxruntime==1.16.0
      - openai==0.28.1
      - overrides==7.4.0
      - packaging==23.2
      - pandas==2.1.1
      - parso==0.8.3
      - pathspec==0.11.2
      - pathtools==0.1.2
      - pexpect==4.8.0
      - pickleshare==0.7.5
      - platformdirs==3.11.0
      - posthog==3.0.2
      - pre-commit==3.4.0
      - prompt-toolkit==3.0.39
      - promptwatch==0.3.0
      - protobuf==4.24.4
      - psutil==5.9.5
      - ptyprocess==0.7.0
      - publication==0.0.3
      - pulsar-client==3.3.0
      - pure-eval==0.2.2
      - pycodestyle==2.11.0
      - pydantic==2.4.2
      - pydantic-core==2.10.1
      - pyflakes==3.1.0
      - pygments==2.16.1
      - pypdf==3.16.2
      - pypika==0.48.9
      - python-dateutil==2.8.2
      - python-dotenv==1.0.0
      - pytz==2023.3.post1
      - pyyaml==6.0.1
      - regex==2023.10.3
      - requests==2.31.0
      - sentry-sdk==1.31.0
      - setproctitle==1.3.3
      - setuptools==68.2.2
      - six==1.16.0
      - smmap==5.0.1
      - sniffio==1.3.0
      - soupsieve==2.5
      - sqlalchemy==2.0.21
      - stack-data==0.6.3
      - starlette==0.27.0
      - sympy==1.12
      - tenacity==8.2.3
      - tiktoken==0.5.1
      - tokenizers==0.14.1
      - tomli==2.0.1
      - torch==2.1.0
      - tqdm==4.66.1
      - traitlets==5.11.2
      - typeguard==2.13.3
      - typer==0.9.0
      - types-requests==2.31.0.6
      - types-urllib3==1.26.25.14
      - typing-extensions==4.8.0
      - typing-inspect==0.9.0
      - tzdata==2023.3
      - urllib3==1.26.17
      - uvicorn==0.23.2
      - uvloop==0.17.0
      - virtualenv==20.24.5
      - wandb==0.15.12
      - watchfiles==0.20.0
      - wcwidth==0.2.8
      - websockets==11.0.3
      - yarl==1.9.2
"""RAG Q&A over a folder of academic-paper PDFs using OpenAI embeddings and OpenSearch.

Loads every PDF in PAPER_FOLDER, reports how many touch on network motifs,
embeds the pages with OpenAI's embedding model (cached on local disk), stores
them in OpenSearch, and runs one conversational retrieval query.
"""

import logging
import os

from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import CacheBackedEmbeddings, OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.storage import LocalFileStore
from langchain.vectorstores import OpenSearchVectorSearch

logging.getLogger("langchain").setLevel(logging.DEBUG)

# Dropbox folder with academic papers
PAPER_FOLDER = "/home/rjurney/Dropbox/Academic Papers/"
assert os.path.exists(PAPER_FOLDER), f"Paper folder not found: {PAPER_FOLDER}"

# Set in my ~/.zshrc; fail fast if missing rather than erroring deep inside the API client
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")

# Load all PDFs from the academic paper folder; silent_errors skips unparseable PDFs
loader = PyPDFDirectoryLoader(PAPER_FOLDER, silent_errors=True)
docs = loader.load()

# How many papers mention network motifs? Each loaded doc is one page/segment,
# so de-duplicate on the "source" metadata to count distinct papers.
motif_docs = [doc for doc in docs if "motif" in doc.page_content]
motif_doc_count = len(motif_docs)
paper_count = len({doc.metadata["source"] for doc in motif_docs})
print(
    f"You have {paper_count} papers on network motifs split across "
    f"{motif_doc_count} document segments in `{PAPER_FOLDER}`."
)

# Embed with the OpenAI embedding model; cache embeddings on disk so reruns are cheap
embeddings = OpenAIEmbeddings()
fs = LocalFileStore("./data/embedding_cache/")
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embeddings, fs, namespace=embeddings.model
)

# OpenSearch vector store to hold the embeddings
opensearch = OpenSearchVectorSearch(
    index_name="academic_papers",
    embedding_function=cached_embedder,
    opensearch_url="http://localhost:9200",
)

# Simple buffer memory to provide prior chat turns as prompt context
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# ConversationalRetrievalChain wires together the LLM, the vector store retriever,
# and the memory system
qa = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=0.8),
    opensearch.as_retriever(),
    memory=memory,
    verbose=True,
)

result = qa({"question": "What are the different types of network motif?"})
print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment