llm-py-web/server/inference.py

# Inference endpoints: list logged conversations and stream chat responses
# to the browser over a WebSocket.
import json

import llm
import llm.cli
import sqlite_utils

from .http import Request, JSONResponse, WebSocket
from .tid import tid_now

# Shared handle on llm's logs database, plus the system prompt from the
# "girlypop" template.
db = sqlite_utils.Database(llm.cli.logs_db_path())
girlypop_prompt = llm.cli.load_template("girlypop").system
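
# WebSocket frame protocol used below: every text frame is one JSON object.
#   {"u": ...}             user message text
#   {"f": ...}             full text of an already-logged response (history replay)
#   {"s": tid}             a streamed response identified by tid is starting
#   {"r": tid, "c": ...}   one chunk of that streamed response
#   {"d": tid}             the streamed response identified by tid is done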


async def list_conversations(request: Request):
    # Return the id and name of every logged conversation.
    conversations = []
    for row in db["conversations"].rows:
        conversations.append({"id": row["id"], "name": row["name"]})
    return JSONResponse(conversations)


async def connect_to_conversation(ws: WebSocket):
    # A non-empty ?continue query parameter means the client already has the
    # history rendered, so the replay below is skipped.
    continuing = bool(ws.query_params.get("continue"))
    conversation_id = ws.path_params["conversation"]
    if conversation_id == "new":
        conversation = llm.AsyncConversation(llm.get_async_model())
    else:
        try:
            conversation: llm.AsyncConversation = llm.cli.load_conversation(
                conversation_id, async_=True
            )
        except Exception:
            await ws.send_denial_response(JSONResponse(
                {"error": "unable to load conversation {}".format(conversation_id)},
                status_code=404,
            ))
            return
    await ws.accept()

    # Only send the system prompt at the start of a conversation: if any logged
    # response already carries one, drop it for subsequent prompts.
    system_prompt = girlypop_prompt
    if not continuing:
        # Replay the logged exchange so the client can render the existing history.
        for response in conversation.responses:
            response: llm.AsyncResponse = response
            if not response._done:
                continue
            if response.prompt.system:
                system_prompt = None
            await ws.send_text(json.dumps({"u": response.prompt.prompt}))    # user message
            await ws.send_text(json.dumps({"f": response.text_or_raise()}))  # full response text

    async for message in ws.iter_text():
        response = conversation.prompt(message, system=system_prompt, stream=True)
        system_prompt = None
        response_tid = tid_now()
        await ws.send_text(json.dumps({"u": message}))       # echo the user message
        await ws.send_text(json.dumps({"s": response_tid}))  # start of streamed response
        async for chunk in response:
            await ws.send_text(json.dumps({"r": response_tid, "c": chunk}))  # one chunk
        await ws.send_text(json.dumps({"d": response_tid}))  # done streaming
        # Persist the completed exchange to the llm logs database.
        (await response.to_sync_response()).log_to_db(db)
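
# The route wiring lives elsewhere in the package. As a minimal sketch of how
# these handlers might be mounted, assuming the server exposes a Starlette-style
# application (the .http wrappers mirror Starlette's Request, JSONResponse and
# WebSocket); the import path and route paths below are illustrative assumptions,
# not taken from this file:
#
#     from starlette.applications import Starlette
#     from starlette.routing import Route, WebSocketRoute
#
#     app = Starlette(routes=[
#         Route("/conversations", list_conversations),
#         WebSocketRoute("/conversations/{conversation}", connect_to_conversation),
#     ])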