llm-py-web/server/inference.py

# Inference endpoints: list logged conversations and stream chat responses
# to the browser over a WebSocket.
import json

import llm
import llm.cli
import sqlite_utils

from .http import Request, JSONResponse, WebSocket
from .tid import tid_now

# Shared handle on llm's logs database, plus the system prompt from the
# "girlypop" template.
db = sqlite_utils.Database(llm.cli.logs_db_path())
girlypop_prompt = llm.cli.load_template("girlypop").system
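
# WebSocket frame protocol used below: every text frame is one JSON object.
#   {"u": ...}             user message text
#   {"f": ...}             full text of an already-logged response (history replay)
#   {"s": tid}             a streamed response identified by tid is starting
#   {"r": tid, "c": ...}   one chunk of that streamed response
#   {"d": tid}             the streamed response identified by tid is done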


async def list_conversations(request: Request):
    # Return the id and name of every logged conversation.
    conversations = []
    for row in db["conversations"].rows:
        conversations.append({"id": row["id"], "name": row["name"]})
    return JSONResponse(conversations)


async def connect_to_conversation(ws: WebSocket):
    # A non-empty ?continue query parameter means the client already has the
    # history rendered, so the replay below is skipped.
    continuing = bool(ws.query_params.get("continue"))
    conversation_id = ws.path_params["conversation"]
    if conversation_id == "new":
        conversation = llm.AsyncConversation(llm.get_async_model())
    else:
        try:
            conversation: llm.AsyncConversation = llm.cli.load_conversation(
                conversation_id, async_=True
            )
        except Exception:
            await ws.send_denial_response(JSONResponse(
                {"error": "unable to load conversation {}".format(conversation_id)},
                status_code=404,
            ))
            return
    await ws.accept()

    # Only send the system prompt at the start of a conversation: if any logged
    # response already carries one, drop it for subsequent prompts.
    system_prompt = girlypop_prompt
    if not continuing:
        # Replay the logged exchange so the client can render the existing history.
        for response in conversation.responses:
            response: llm.AsyncResponse = response
            if not response._done:
                continue
            if response.prompt.system:
                system_prompt = None
            await ws.send_text(json.dumps({"u": response.prompt.prompt}))    # user message
            await ws.send_text(json.dumps({"f": response.text_or_raise()}))  # full response text

    async for message in ws.iter_text():
        response = conversation.prompt(message, system=system_prompt, stream=True)
        system_prompt = None
        response_tid = tid_now()
        await ws.send_text(json.dumps({"u": message}))       # echo the user message
        await ws.send_text(json.dumps({"s": response_tid}))  # start of streamed response
        async for chunk in response:
            await ws.send_text(json.dumps({"r": response_tid, "c": chunk}))  # one chunk
        await ws.send_text(json.dumps({"d": response_tid}))  # done streaming
        # Persist the completed exchange to the llm logs database.
        (await response.to_sync_response()).log_to_db(db)
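
# The route wiring lives elsewhere in the package. As a minimal sketch of how
# these handlers might be mounted, assuming the server exposes a Starlette-style
# application (the .http wrappers mirror Starlette's Request, JSONResponse and
# WebSocket); the import path and route paths below are illustrative assumptions,
# not taken from this file:
#
#     from starlette.applications import Starlette
#     from starlette.routing import Route, WebSocketRoute
#
#     app = Starlette(routes=[
#         Route("/conversations", list_conversations),
#         WebSocketRoute("/conversations/{conversation}", connect_to_conversation),
#     ])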