Llamafile with LlamaIndex bug repro scenario
@DhanshreeA · Last active October 23, 2024

Four variants of the same Summarizer run against a local Llamafile server (http://127.0.0.1:8080). The non-streaming complete() call hangs and times out while the server becomes unresponsive; stream_complete(), chat(), and stream_chat() all return normally.
# This DOES NOT work.
# The complete() call hangs and eventually exits with a timeout error.
# The Llamafile server becomes unresponsive, with no active GPU utilisation.
from llama_index.core import PromptTemplate
from llama_index.llms.llamafile import Llamafile

BASE_URL = "http://127.0.0.1:8080"
EOF_TOKEN = "<|eot_id|>"


class Summarizer:
    def __init__(self) -> None:
        self.llm = Llamafile(base_url=BASE_URL)
        self.response_header = """
This is my understanding of your query:
"""
        self.response_format = f"""
When generating a response you should separate each line by a newline character and use numerical bullets.
Each line should be in second person.
Always begin your response with the following line:
{self.response_header}
"""
        self.summary_prompt_template_str = """
You are a helpful assistant who can summarize what is being asked.
Based on this instruction, generate a summary of what is asked, one element on each line, related to the following input query:
Query: {query}
"""
        self.llm.system_prompt = self.response_format
        self.summary_gen_prompt = PromptTemplate(
            template=self.summary_prompt_template_str,
            prompt_type="summary",
        )

    def summarize(self, query: str):
        prompt = self.summary_gen_prompt.format(query=query)
        # Non-streaming completion: this is the call that never returns.
        return self.llm.complete(prompt=prompt, stop=[EOF_TOKEN])


if __name__ == "__main__":
    summarizer = Summarizer()
    query = "What is the capital of France?"
    print(summarizer.summarize(query))
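For reference, the hang is specific to the non-streaming path. The same request can be issued directly against llamafile's /completion endpoint with requests; this is a minimal sketch, and the payload fields (prompt, stream, stop) are assumptions based on the llama.cpp server API that llamafile embeds, not a verified repro.

# Hypothetical raw-HTTP equivalent of the failing complete() call above.
# Payload fields follow the llama.cpp /completion API (an assumption here).
import requests

payload = {
    "prompt": "What is the capital of France?",
    "stream": False,          # non-streaming: the path that hangs
    "stop": ["<|eot_id|>"],
}
# A short timeout makes the hang visible instead of blocking indefinitely.
resp = requests.post("http://127.0.0.1:8080/completion", json=payload, timeout=30)
resp.raise_for_status()
print(resp.json().get("content"))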
# This WORKS.
# Uses the /completion API with streaming set to True.
from llama_index.core import PromptTemplate
from llama_index.llms.llamafile import Llamafile

BASE_URL = "http://127.0.0.1:8080"
EOF_TOKEN = "<|eot_id|>"


class Summarizer:
    def __init__(self) -> None:
        self.llm = Llamafile(base_url=BASE_URL)
        self.response_header = """
This is my understanding of your query:
"""
        self.response_format = f"""
When generating a response you should separate each line by a newline character and use numerical bullets.
Each line should be in second person.
Always begin your response with the following line:
{self.response_header}
"""
        self.summary_prompt_template_str = """
You are a helpful assistant who can summarize what is being asked.
Based on this instruction, generate a summary of what is asked, one element on each line, related to the following input query:
Query: {query}
"""
        self.llm.system_prompt = self.response_format
        self.summary_gen_prompt = PromptTemplate(
            template=self.summary_prompt_template_str,
            prompt_type="summary",
        )

    def summarize(self, query: str):
        prompt = self.summary_gen_prompt.format(query=query)
        for chunk in self.llm.stream_complete(prompt=prompt, stop=[EOF_TOKEN]):
            yield chunk.delta


if __name__ == "__main__":
    summarizer = Summarizer()
    query = "What is the capital of France?"
    for chunk in summarizer.summarize(query):
        print(chunk)
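Because summarize() is now a generator of text deltas, the chunks can also be joined into one string when a single result is wanted; a small usage sketch:

# Drain the generator and assemble the full summary in one string.
summarizer = Summarizer()
summary = "".join(summarizer.summarize("What is the capital of France?"))
print(summary)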
# This WORKS as well.
# Uses the chat API (/v1/chat/completions) with streaming set to False.
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import PromptTemplate
from llama_index.llms.llamafile import Llamafile

BASE_URL = "http://127.0.0.1:8080"
EOF_TOKEN = "<|eot_id|>"


class Summarizer:
    def __init__(self) -> None:
        self.llm = Llamafile(base_url=BASE_URL)
        self.response_header = """
This is my understanding of your query:
"""
        self.response_format = f"""
When generating a response you should separate each line by a newline character and use numerical bullets.
Each line should be in second person.
Always begin your response with the following line:
{self.response_header}
"""
        self.summary_prompt_template_str = """
You are a helpful assistant who can summarize what is being asked.
Based on this instruction, generate a summary of what is asked, \
one element on each line, related to the following input query:
Query: {query}
"""
        self.system_message = ChatMessage(
            role=MessageRole.SYSTEM,
            content=self.response_format,
        )
        self.summary_gen_prompt = PromptTemplate(
            template=self.summary_prompt_template_str,
            prompt_type="summary",
        )

    def summarize(self, query: str):
        messages = [
            self.system_message,
            self.summary_gen_prompt.format_messages(query=query)[0],
        ]
        return self.llm.chat(messages=messages, stop=[EOF_TOKEN])


if __name__ == "__main__":
    summarizer = Summarizer()
    query = "What is the capital of France?"
    print(summarizer.summarize(query))
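Note that chat() returns a ChatResponse object, so the print above shows its string form; to get just the assistant text, it can be read off the wrapped message (a small usage sketch, relying on LlamaIndex's ChatResponse.message.content):

# Print only the assistant's text rather than the full ChatResponse.
summarizer = Summarizer()
response = summarizer.summarize("What is the capital of France?")
print(response.message.content)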
# This WORKS.
# Uses the stream_chat API from LlamaIndex, a thin wrapper around the
# /v1/chat/completions API, with streaming set to True.
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import PromptTemplate
from llama_index.llms.llamafile import Llamafile

BASE_URL = "http://127.0.0.1:8080"
EOF_TOKEN = "<|eot_id|>"


class Summarizer:
    def __init__(self) -> None:
        self.llm = Llamafile(base_url=BASE_URL)
        self.response_header = """
This is my understanding of your query:
"""
        self.response_format = f"""
When generating a response you should separate each line by a newline character and use numerical bullets.
Each line should be in second person.
Always begin your response with the following line:
{self.response_header}
"""
        self.summary_prompt_template_str = """
You are a helpful assistant who can summarize what is being asked.
Based on this instruction, generate a summary of what is asked, \
one element on each line, related to the following input query:
Query: {query}
"""
        self.system_message = ChatMessage(
            role=MessageRole.SYSTEM,
            content=self.response_format,
        )
        self.summary_gen_prompt = PromptTemplate(
            template=self.summary_prompt_template_str,
            prompt_type="summary",
        )

    def summarize(self, query: str):
        messages = [
            self.system_message,
            self.summary_gen_prompt.format_messages(query=query)[0],
        ]
        for chunk in self.llm.stream_chat(messages=messages, stop=[EOF_TOKEN]):
            yield chunk.delta


if __name__ == "__main__":
    summarizer = Summarizer()
    query = "What is the capital of France?"
    for chunk in summarizer.summarize(query):
        print(chunk)
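Since every path except the non-streaming complete() responds, one possible interim workaround is to emulate a blocking completion on top of stream_complete(). This is a sketch, not part of LlamaIndex: complete_via_stream is a hypothetical helper, and it assumes the usual LlamaIndex convention that each streamed CompletionResponse carries the accumulated text in .text.

# Hypothetical workaround: drain the working stream_complete() generator
# and return the final chunk, whose .text should hold the full completion
# (assumption: the llamafile integration follows the usual accumulation
# convention for CompletionResponse streams).
from llama_index.llms.llamafile import Llamafile

def complete_via_stream(llm: Llamafile, prompt: str, **kwargs):
    last = None
    for chunk in llm.stream_complete(prompt=prompt, **kwargs):
        last = chunk
    return last

llm = Llamafile(base_url="http://127.0.0.1:8080")
result = complete_via_stream(llm, "What is the capital of France?", stop=["<|eot_id|>"])
print(result.text if result is not None else "")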