译者 | 朱先忠
审校 | 重楼
pip install --upgrade cleanlab-studio
from cleanlab_studio import Studio

# Create the Cleanlab Studio client; paste your API key between the quotes.
studio = Studio("")

# Ask TLM to log an explanation with every score. "model" selects the
# underlying LLM (GPT, Claude, etc.).
tlm = studio.TLM(options={"log": ["explanation"], "model": "gpt-4o"})

# Set the prompt.
out = tlm.prompt("How many vowels are there in the word 'Abracadabra'.?")

# The TLM result holds the actual output under "response", plus a
# trustworthiness score and an explanation.
print(f"Model's response = {out['response']}")
print(f"Trustworthiness score = {out['trustworthiness_score']}")
print(f"Explanation = {out['log']['explanation']}")
Model's response = The word "Abracadabra" contains 6 vowels. The vowels are: A, a, a, a, a, and a.
Trustworthiness score = 0.6842228802750124
Explanation = This response is untrustworthy due to a lack of consistency in possible responses from the model. Here's one inconsistent alternate response that the model considered (which may not be accurate either):
5.
Model's response = Let me count the vowels in 'Abracadabra':
A-b-r-a-c-a-d-a-b-r-a
The vowels are: A, a, a, a, a
There are 5 vowels in the word 'Abracadabra'.
Trustworthiness score = 0.9378276048845285
Explanation = Did not find a reason to doubt trustworthiness.
import markdown
from cleanlab_studio import Studio
# `display` and `Markdown` are imported from the public IPython.display
# module rather than the internal IPython.core.display path.
from IPython.display import Markdown, display

# Initialise Cleanlab Studio with an API key.
studio = Studio("")  # replace with your actual API key

# Models to evaluate.
models = ["gpt-4o", "claude-3.5-sonnet-v2"]

# Prompt sent to every model.
prompt_text = "Which one of 9.11 and 9.9 is bigger?"

# Query each model through TLM and render its answer, trustworthiness
# score and explanation as Markdown.
for model in models:
    tlm = studio.TLM(options={"log": ["explanation"], "model": model})
    out = tlm.prompt(prompt_text)

    # Blank lines separate the fields so Markdown renders each one as its
    # own paragraph instead of a single run-on line.
    md_content = f"""## 模型: {model}

**响应**: {out['response']}

**可信度评分**: {out['trustworthiness_score']}

**解释**: {out['log']['explanation']}

---
"""
    display(Markdown(md_content))
import os

import streamlit as st
from cleanlab_studio import Studio
from IPython.display import Markdown, display
from langchain_groq.chat_models import ChatGroq

os.environ["GROQ_API_KEY"] = st.secrets["GROQ_API_KEY"]

# Name of the Groq-hosted model; bound to `model` so the report below
# labels the model that actually produced the response.
model = "deepseek-r1-distill-llama-70b"

# Initialise the Groq chat model.
groq_llm = ChatGroq(model=model, temperature=0.5)

prompt = "Which one of 9.11 and 9.9 is bigger?"

# Get the response from the model.
response = groq_llm.invoke(prompt)

# Initialise Cleanlab Studio. The API key is read from Streamlit secrets
# (like the Groq key above) instead of being hard-coded in the source.
studio = Studio(st.secrets["CLEANLAB_API_KEY"])
cleanlab_tlm = studio.TLM(options={"log": ["explanation"]})  # log explanations

# Score the externally generated response; the output holds the
# trustworthiness score and the explanation.
output = cleanlab_tlm.get_trustworthiness_score(
    prompt, response=response.content.strip()
)

# Blank lines separate the fields so Markdown renders them as paragraphs.
md_content = f"""## 模型: {model}

**Response:** {response.content.strip()}

**Trustworthiness Score:** {output['trustworthiness_score']}

**Explanation:** {output['log']['explanation']}

---
"""
display(Markdown(md_content))
pip install llama-parse llama-index-core llama-index-embeddings-huggingface llama-index-llms-cleanlab requests beautifulsoup4 pdfkit nest-asyncio
import os
from typing import ClassVar, Dict, List

import nest_asyncio
import pdfkit
import requests
from bs4 import BeautifulSoup
from llama_index.core import (
    PromptTemplate,
    Settings,
    SimpleDirectoryReader,
    VectorStoreIndex,
)
from llama_index.core.instrumentation import get_dispatcher
from llama_index.core.instrumentation.event_handlers import BaseEventHandler
from llama_index.core.instrumentation.events import BaseEvent
from llama_index.core.instrumentation.events.llm import LLMCompletionEndEvent
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.cleanlab import CleanlabTLM
from llama_index.readers.docling import DoclingReader
from llama_parse import LlamaParse
# TLM options: underlying model, response length cap, and explanation logging.
options = {
    "model": "gpt-4o",
    "max_tokens": 512,
    "log": ["explanation"],
}

# Get a free API key from https://cleanlab.ai/ and paste it below.
llm = CleanlabTLM(api_key="", options=options)

# Make the TLM-backed LLM and a HuggingFace embedding model the LlamaIndex
# defaults.
Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)
# Event handler for trustworthiness scores
class GetTrustworthinessScore(BaseEventHandler):
    """Record the trustworthiness score carried by LLM completion-end events."""

    # Completion-end events seen so far (class-level, shared by instances).
    events: ClassVar[List[BaseEvent]] = []
    # Score taken from the most recent completion-end event.
    trustworthiness_score: float = 0.0

    @classmethod
    def class_name(cls) -> str:
        """Return the handler's class name."""
        return "GetTrustworthinessScore"

    def handle(self, event: BaseEvent) -> Dict:
        """Store the score from a completion-end event; ignore other events.

        The score is read from ``event.response.additional_kwargs`` and
        falls back to 0.0 when the key is missing. Always returns an empty
        dict.
        """
        if isinstance(event, LLMCompletionEndEvent):
            self.trustworthiness_score = event.response.additional_kwargs.get(
                "trustworthiness_score", 0.0
            )
            self.events.append(event)
        return {}


# Create the handler and register it with the dispatcher so it receives
# LLM events. display_response() reads the score from this instance.
event_handler = GetTrustworthinessScore()
get_dispatcher().add_event_handler(event_handler)


# Helper to display the LLM response
def display_response(response):
    """Print the response text and the last recorded trustworthiness score.

    Args:
        response: Query result whose ``response`` attribute holds the text.
    """
    response_str = response.response
    trustworthiness_score = event_handler.trustworthiness_score
    print(f"Response: {response_str}")
    print(f"Trustworthiness score: {round(trustworthiness_score, 2)}")
接下来,我们将通过从给定的URL抓取数据来生成PDF。为了演示目的,我们仅从这篇关于大语言模型的维基百科文章(遵循Creative Commons Attribution-ShareAlike 4.0许可)抓取数据。
##########################################
# Generate PDFs from multiple URLs
##########################################

# Path to the wkhtmltopdf executable (a Windows path; adjust for your system).
wkhtml_path = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=wkhtml_path)

# Map each document name to the URL it is scraped from.
urls = {
    "LLMs": "https://en.wikipedia.org/wiki/Large_language_model"
}

# Directory where the PDFs are saved.
pdf_directory = "PDFs"
os.makedirs(pdf_directory, exist_ok=True)

# Document name -> path of the generated PDF (successful conversions only).
pdf_paths = {}
for doc_name, url in urls.items():
    try:
        print(f"Processing {doc_name} from {url} ...")
        # The timeout keeps a stalled connection from hanging the script;
        # raise_for_status surfaces HTTP errors instead of parsing an
        # error page.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        main_content = soup.find("div", {"id": "mw-content-text"})
        if main_content is None:
            raise ValueError("Main content not found")
        # Replace protocol-relative URLs ("//host/...") with absolute
        # https URLs so wkhtmltopdf can resolve them.
        html_string = (
            str(main_content)
            .replace('src="//', 'src="https://')
            .replace('href="//', 'href="https://')
        )
        pdf_file_path = os.path.join(pdf_directory, f"{doc_name}.pdf")
        pdfkit.from_string(
            html_string,
            pdf_file_path,
            options={'encoding': 'UTF-8', 'quiet': ''},
            configuration=config,
        )
        pdf_paths[doc_name] = pdf_file_path
        print(f"Saved PDF for {doc_name} at {pdf_file_path}")
    except Exception as e:
        # Best effort: report the failure and continue with the next URL.
        print(f"Error processing {doc_name}: {e}")
##########################################
# Parse the PDFs with LlamaParse and inject metadata
##########################################

# Parsing instructions (used if your parser supports them).
parsing_instructions = """提取文档的markdown格式内容。按页将文档拆分为节点(例如)。确保每个节点具有文档名称和页码的元数据。"""

# Create the LlamaParse instance.
parser = LlamaParse(
    api_key="",  # replace with your actual key
    parsing_instruction=parsing_instructions,
    result_type="markdown",
    premium_mode=True,
    max_timeout=600,
)

# Directory for the combined Markdown files (one per PDF).
output_md_dir = os.path.join(pdf_directory, "markdown_docs")
os.makedirs(output_md_dir, exist_ok=True)

# Collects every updated node for indexing.
all_nodes = []

for doc_name, pdf_path in pdf_paths.items():
    try:
        print(f"Parsing PDF for {doc_name} from {pdf_path} ...")
        nodes = parser.load_data(pdf_path)  # returns a list of nodes
        updated_nodes = []

        # For each node: update the metadata and prepend a citation header
        # to the text.
        for i, node in enumerate(nodes, start=1):
            # Copy any existing metadata and add our own keys.
            new_metadata = dict(node.metadata) if node.metadata else {}
            new_metadata["document_name"] = doc_name
            # Fall back to the node's 1-based position when the parser did
            # not supply a page number.
            if "page_number" not in new_metadata:
                new_metadata["page_number"] = str(i)

            # Build the citation header.
            citation_header = f"[{new_metadata['document_name']}, page {new_metadata['page_number']}]\n\n"

            # Prepend the citation header to the node text.
            updated_text = citation_header + node.text
            new_node = node.__class__(text=updated_text, metadata=new_metadata)
            updated_nodes.append(new_node)

        # Save one combined Markdown file for the document, built from the
        # updated node texts.
        combined_texts = [node.text for node in updated_nodes]
        combined_md = "\n\n---\n\n".join(combined_texts)
        md_filename = f"{doc_name}.md"
        md_filepath = os.path.join(output_md_dir, md_filename)
        with open(md_filepath, "w", encoding="utf-8") as f:
            f.write(combined_md)
        print(f"Saved combined markdown for {doc_name} to {md_filepath}")

        # Add the updated nodes to the global list for indexing.
        all_nodes.extend(updated_nodes)
        print(f"Parsed {len(updated_nodes)} nodes from {doc_name}.")
    except Exception as e:
        # Best effort: report the failure and continue with the next PDF.
        print(f"Error parsing {doc_name}: {e}")
##########################################
# Create the index and the query engine
##########################################

# Build the index from all nodes.
index = VectorStoreIndex.from_documents(documents=all_nodes)

# Custom prompt that restricts answers to the retrieved context.
# {context_str} and {query_str} are filled in at query time.
prompt_template = """你是一个具有主题专业知识的AI助手。
仅使用提供的上下文回答问题。
在必要时,以格式良好的Markdown格式回答,包含项目符号和章节。
如果提供的上下文不支持答案,请回复“我不知道。”

上下文:
{context_str}

问题:
{query_str}

答案:
"""

# Create the query engine with the custom prompt. The template is passed
# as `text_qa_template`, wrapped in a PromptTemplate; a plain string under
# the `prompt_template` keyword is not a parameter the query engine reads,
# so the custom prompt would never be applied.
query_engine = index.as_query_engine(
    similarity_top_k=3,
    llm=llm,
    text_qa_template=PromptTemplate(prompt_template),
)

print("Combined index and query engine created successfully!")
# Ask a question that the indexed article covers.
query = "When is mixture of experts approach used?"
response = query_engine.query(query)
display_response(response)
# Ask a comparison question and show the answer with its trustworthiness score.
query = "How do you compare Deepseek model with OpenAI's models?"
response = query_engine.query(query)
display_response(response)
回答“How do you compare the Deepseek model with OpenAI’s models?(您如何将Deepseek模型与OpenAI的模型进行比较?)”的问题(作者提供的图片)
原文标题:How to Measure the Reliability of a Large Language Model’s Response,作者:Umair Ali Khan |