Llamafile with LlamaIndex bug repro scenario
@DhanshreeA · Last active October 23, 2024

Four variants of the same Summarizer run against a local Llamafile server (http://127.0.0.1:8080). The non-streaming complete() call hangs and times out while the server becomes unresponsive; stream_complete(), chat(), and stream_chat() all return normally.
# This DOES NOT work.
# The complete() call hangs and eventually exits with a timeout error.
# The Llamafile server becomes unresponsive, with no active GPU utilisation.
from llama_index.core import PromptTemplate
from llama_index.llms.llamafile import Llamafile

BASE_URL = "http://127.0.0.1:8080"
EOF_TOKEN = "<|eot_id|>"


class Summarizer:
    def __init__(self) -> None:
        self.llm = Llamafile(base_url=BASE_URL)
        self.response_header = """
This is my understanding of your query:
"""
        self.response_format = f"""
When generating a response you should separate each line by a newline character and use numerical bullets.
Each line should be in second person.
Always begin your response with the following line:
{self.response_header}
"""
        self.summary_prompt_template_str = """
You are a helpful assistant who can summarize what is being asked.
Based on this instruction, generate a summary of what is asked, one element on each line, related to the following input query:
Query: {query}
"""
        self.llm.system_prompt = self.response_format
        self.summary_gen_prompt = PromptTemplate(
            template=self.summary_prompt_template_str,
            prompt_type="summary",
        )

    def summarize(self, query: str):
        prompt = self.summary_gen_prompt.format(query=query)
        # Non-streaming completion: this is the call that never returns.
        return self.llm.complete(prompt=prompt, stop=[EOF_TOKEN])


if __name__ == "__main__":
    summarizer = Summarizer()
    query = "What is the capital of France?"
    print(summarizer.summarize(query))
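For reference, the hang is specific to the non-streaming path. The same request can be issued directly against llamafile's /completion endpoint with requests; this is a minimal sketch, and the payload fields (prompt, stream, stop) are assumptions based on the llama.cpp server API that llamafile embeds, not a verified repro.

# Hypothetical raw-HTTP equivalent of the failing complete() call above.
# Payload fields follow the llama.cpp /completion API (an assumption here).
import requests

payload = {
    "prompt": "What is the capital of France?",
    "stream": False,          # non-streaming: the path that hangs
    "stop": ["<|eot_id|>"],
}
# A short timeout makes the hang visible instead of blocking indefinitely.
resp = requests.post("http://127.0.0.1:8080/completion", json=payload, timeout=30)
resp.raise_for_status()
print(resp.json().get("content"))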
# This WORKS.
# Uses the /completion API with streaming set to True.
from llama_index.core import PromptTemplate
from llama_index.llms.llamafile import Llamafile

BASE_URL = "http://127.0.0.1:8080"
EOF_TOKEN = "<|eot_id|>"


class Summarizer:
    def __init__(self) -> None:
        self.llm = Llamafile(base_url=BASE_URL)
        self.response_header = """
This is my understanding of your query:
"""
        self.response_format = f"""
When generating a response you should separate each line by a newline character and use numerical bullets.
Each line should be in second person.
Always begin your response with the following line:
{self.response_header}
"""
        self.summary_prompt_template_str = """
You are a helpful assistant who can summarize what is being asked.
Based on this instruction, generate a summary of what is asked, one element on each line, related to the following input query:
Query: {query}
"""
        self.llm.system_prompt = self.response_format
        self.summary_gen_prompt = PromptTemplate(
            template=self.summary_prompt_template_str,
            prompt_type="summary",
        )

    def summarize(self, query: str):
        prompt = self.summary_gen_prompt.format(query=query)
        for chunk in self.llm.stream_complete(prompt=prompt, stop=[EOF_TOKEN]):
            yield chunk.delta


if __name__ == "__main__":
    summarizer = Summarizer()
    query = "What is the capital of France?"
    for chunk in summarizer.summarize(query):
        print(chunk)
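Because summarize() is now a generator of text deltas, the chunks can also be joined into one string when a single result is wanted; a small usage sketch:

# Drain the generator and assemble the full summary in one string.
summarizer = Summarizer()
summary = "".join(summarizer.summarize("What is the capital of France?"))
print(summary)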
# This WORKS as well.
# Uses the chat API (/v1/chat/completions) with streaming set to False.
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import PromptTemplate
from llama_index.llms.llamafile import Llamafile

BASE_URL = "http://127.0.0.1:8080"
EOF_TOKEN = "<|eot_id|>"


class Summarizer:
    def __init__(self) -> None:
        self.llm = Llamafile(base_url=BASE_URL)
        self.response_header = """
This is my understanding of your query:
"""
        self.response_format = f"""
When generating a response you should separate each line by a newline character and use numerical bullets.
Each line should be in second person.
Always begin your response with the following line:
{self.response_header}
"""
        self.summary_prompt_template_str = """
You are a helpful assistant who can summarize what is being asked.
Based on this instruction, generate a summary of what is asked, \
one element on each line, related to the following input query:
Query: {query}
"""
        self.system_message = ChatMessage(
            role=MessageRole.SYSTEM,
            content=self.response_format,
        )
        self.summary_gen_prompt = PromptTemplate(
            template=self.summary_prompt_template_str,
            prompt_type="summary",
        )

    def summarize(self, query: str):
        messages = [
            self.system_message,
            self.summary_gen_prompt.format_messages(query=query)[0],
        ]
        return self.llm.chat(messages=messages, stop=[EOF_TOKEN])


if __name__ == "__main__":
    summarizer = Summarizer()
    query = "What is the capital of France?"
    print(summarizer.summarize(query))
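Note that chat() returns a ChatResponse object, so the print above shows its string form; to get just the assistant text, it can be read off the wrapped message (a small usage sketch, relying on LlamaIndex's ChatResponse.message.content):

# Print only the assistant's text rather than the full ChatResponse.
summarizer = Summarizer()
response = summarizer.summarize("What is the capital of France?")
print(response.message.content)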
# This WORKS.
# Uses the stream_chat API from LlamaIndex, a thin wrapper around the
# /v1/chat/completions API, with streaming set to True.
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import PromptTemplate
from llama_index.llms.llamafile import Llamafile

BASE_URL = "http://127.0.0.1:8080"
EOF_TOKEN = "<|eot_id|>"


class Summarizer:
    def __init__(self) -> None:
        self.llm = Llamafile(base_url=BASE_URL)
        self.response_header = """
This is my understanding of your query:
"""
        self.response_format = f"""
When generating a response you should separate each line by a newline character and use numerical bullets.
Each line should be in second person.
Always begin your response with the following line:
{self.response_header}
"""
        self.summary_prompt_template_str = """
You are a helpful assistant who can summarize what is being asked.
Based on this instruction, generate a summary of what is asked, \
one element on each line, related to the following input query:
Query: {query}
"""
        self.system_message = ChatMessage(
            role=MessageRole.SYSTEM,
            content=self.response_format,
        )
        self.summary_gen_prompt = PromptTemplate(
            template=self.summary_prompt_template_str,
            prompt_type="summary",
        )

    def summarize(self, query: str):
        messages = [
            self.system_message,
            self.summary_gen_prompt.format_messages(query=query)[0],
        ]
        for chunk in self.llm.stream_chat(messages=messages, stop=[EOF_TOKEN]):
            yield chunk.delta


if __name__ == "__main__":
    summarizer = Summarizer()
    query = "What is the capital of France?"
    for chunk in summarizer.summarize(query):
        print(chunk)
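Since every path except the non-streaming complete() responds, one possible interim workaround is to emulate a blocking completion on top of stream_complete(). This is a sketch, not part of LlamaIndex: complete_via_stream is a hypothetical helper, and it assumes the usual LlamaIndex convention that each streamed CompletionResponse carries the accumulated text in .text.

# Hypothetical workaround: drain the working stream_complete() generator
# and return the final chunk, whose .text should hold the full completion
# (assumption: the llamafile integration follows the usual accumulation
# convention for CompletionResponse streams).
from llama_index.llms.llamafile import Llamafile

def complete_via_stream(llm: Llamafile, prompt: str, **kwargs):
    last = None
    for chunk in llm.stream_complete(prompt=prompt, **kwargs):
        last = chunk
    return last

llm = Llamafile(base_url="http://127.0.0.1:8080")
result = complete_via_stream(llm, "What is the capital of France?", stop=["<|eot_id|>"])
print(result.text if result is not None else "")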