@acidsound
Created August 28, 2023
How to use exllama + qdrant indexing & search
# --- qdrant: index sentence embeddings with sentence-transformers ---
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer

# Initialize the client
client = QdrantClient("localhost:6333")

COLLECTION_NAME = "docs"

# paraphrase-multilingual-mpnet-base-v2 produces 768-dimensional embeddings,
# so the collection's vector size must be 768 (the original 384 would fail on upsert)
client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

# Load the multilingual embedding model on the GPU
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", device="cuda")

sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of strings.",
    "The quick brown fox jumps over the lazy dog.",
]
sentence_embeddings = model.encode(sentences)

# Index the embeddings; each point carries an id, its vector, and an arbitrary payload
client.upsert(
    collection_name=COLLECTION_NAME,
    points=[
        PointStruct(
            id=idx,
            vector=vector.tolist(),
            payload={"color": "red", "rand_number": idx % 10},
        )
        for idx, vector in enumerate(sentence_embeddings)
    ],
)
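The title promises search as well as indexing, but the file stops after the upsert. A minimal retrieval sketch, assuming the collection and model above (the query sentence and the limit of 3 are illustrative):

# Query the collection with a new sentence (sketch; assumes the index built above)
query = "A fast animal leaps over a sleeping one."
query_vector = model.encode(query).tolist()

hits = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_vector,
    limit=3,  # return the 3 nearest neighbours by cosine similarity
)
for hit in hits:
    print(hit.id, hit.score, hit.payload)

client.search returns ScoredPoint objects, so each result exposes hit.id, hit.score, and hit.payload.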
# --- exllama: load a GPTQ-quantized model and run a simple generation ---
from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
from exllama.tokenizer import ExLlamaTokenizer
from exllama.generator import ExLlamaGenerator
import torch
import datetime


def log(message):
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] {message}")


# Inference only: disable autograd and initialize CUDA up front
torch.set_grad_enabled(False)
torch.cuda._lazy_init()

config_path = "./models/Scarlett-13B-GPTQ"
config = ExLlamaConfig(f"{config_path}/config.json")
config.model_path = f"{config_path}/model.safetensors"

log(">>> model loading...")
model = ExLlama(config)
log(">>> model caching...")
cache = ExLlamaCache(model)
log(">>> model tokenizing...")
tokenizer = ExLlamaTokenizer(f"{config_path}/tokenizer.model")
log(">>> model generating...")
generator = ExLlamaGenerator(model, tokenizer, cache)

# Optional sampling settings (defaults are used here)
# generator.disallow_tokens([tokenizer.eos_token_id])
# generator.settings.token_repetition_penalty_max = 1.2
# generator.settings.temperature = 0.95
# generator.settings.top_p = 0.65
# generator.settings.top_k = 100
# generator.settings.typical = 0.5

# Produce a simple generation
log(">>> make simple autocomplete..")
prompt = "Once upon a time,"
log(prompt)
output = generator.generate_simple(prompt, max_new_tokens=200)
log(output[len(prompt):])
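Nothing in the gist actually connects the two scripts, but a retrieval-augmented prompt is the obvious next step. Below is a minimal sketch, assuming both scripts run in one process, that the SentenceTransformer is bound to a separate name (here embedder, since both scripts call their model "model"), and that the original sentence text was stored in the payload at upsert time, e.g. payload={"text": sentence}, which the indexing code above does not do:

# Sketch only: assumes embedder = SentenceTransformer(...) from the first script
# and that each point's payload was stored as {"text": sentence} when indexing
question = "What does the framework do with input sentences?"
hits = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=embedder.encode(question).tolist(),
    limit=2,
)
context = "\n".join(hit.payload["text"] for hit in hits)

prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
output = generator.generate_simple(prompt, max_new_tokens=200)
log(output[len(prompt):])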