译者 | 布加迪
审校 | 重楼
首先,我们不会在这个项目中使用Notebook IDE,因为我们希望RAG系统像生产系统一样工作。因此,应该准备一个标准的编程语言IDE,比如Visual Studio Code(VS Code)。
python -m venv rag-env-audio
pip install openai-whisper chromadb sentence-transformers sounddevice numpy scipy PyPDF2 transformers torch langchain-core langchain-community
pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
import os

import chromadb
import numpy as np
import sounddevice as sd
import torch
import whisper
from langchain_text_splitters import RecursiveCharacterTextSplitter
from scipy.io.wavfile import write
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForCausalLM, AutoTokenizer

# Temporary WAV file that holds the most recent microphone recording.
AUDIO_FILE = "user_input.wav"
# WAV file the synthesized spoken answer is written to.
RESPONSE_AUDIO_FILE = "response.wav"
# PDF document that is chunked and indexed as the knowledge base.
PDF_FILE = "Insurance_Handbook_20103.pdf"
# Default microphone sample rate in Hz used when recording.
SAMPLE_RATE = 16000
# Phrase the user must say to activate the assistant.
WAKE_WORD = "Hi"
# Minimum cosine similarity between the transcription and WAKE_WORD.
SIMILARITY_THRESHOLD = 0.4
# Default number of recordings tried before giving up on the wake word.
MAX_ATTEMPTS = 5
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
第一步是创建一项功能来记录输入语音,并将语音转录成文本数据。我们将使用sounddevice库来录制语音,并使用OpenAI Whisper进行音频转录。
# For recording audio input.
def record_audio(filename, duration=5, samplerate=SAMPLE_RATE):
    """Record mono audio from the default input device and save it as WAV.

    Args:
        filename: Path of the WAV file to write.
        duration: Length of the recording in seconds.
        samplerate: Sampling rate in Hz used for recording and for the file.
    """
    print("Listening... Speak now!")
    audio = sd.rec(
        int(duration * samplerate),
        samplerate=samplerate,
        channels=1,
        dtype='float32',
    )
    sd.wait()  # Block until the recording buffer has been filled.
    print("Recording finished.")
    # Clip before scaling so an out-of-range sample cannot wrap around when
    # it is cast to 16-bit PCM.
    pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    write(filename, samplerate, pcm)


# Loaded Whisper models keyed by model name, so repeated transcriptions
# (e.g. every wake-word attempt) do not reload the weights each time.
_WHISPER_MODELS = {}


# Transcribe the Input audio into text
def transcribe_audio(filename):
    """Transcribe an audio file with Whisper's ``base.en`` model.

    Args:
        filename: Path of the audio file to transcribe.

    Returns:
        The transcription, stripped of surrounding whitespace and lowercased.
    """
    print("Transcribing audio...")
    model = _WHISPER_MODELS.get("base.en")
    if model is None:
        model = whisper.load_model("base.en")
        _WHISPER_MODELS["base.en"] = model
    result = model.transcribe(filename)
    return result["text"].strip().lower()
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
# Detecting Wake Word to activate the RAG System
def detect_wake_word(max_attempts=MAX_ATTEMPTS):
    """Listen for the wake word, comparing transcriptions by embedding similarity.

    Each attempt records a short clip, transcribes it, and compares the
    sentence embedding of the transcription with the embedding of
    ``WAKE_WORD`` using cosine similarity.

    Args:
        max_attempts: Maximum number of recordings to try.

    Returns:
        True as soon as one attempt reaches ``SIMILARITY_THRESHOLD``;
        False when every attempt falls below it.
    """
    print("Waiting for wake word...")
    text_embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    # cosine_similarity expects 2-D inputs, hence the (1, dim) reshape.
    wake_word_embedding = text_embedding_model.encode(WAKE_WORD).reshape(1, -1)

    attempts = 0
    while attempts < max_attempts:
        # NOTE(review): the 3-second clip length is an assumption; confirm
        # against the intended listening window.
        record_audio(AUDIO_FILE, duration=3)
        transcription = transcribe_audio(AUDIO_FILE)

        transcription_embedding = text_embedding_model.encode(transcription).reshape(1, -1)
        similarity = cosine_similarity(wake_word_embedding, transcription_embedding)[0][0]

        if similarity >= SIMILARITY_THRESHOLD:
            print(f"Wake word detected: {WAKE_WORD}")
            return True

        attempts += 1
        print(f"Attempt {attempts}/{max_attempts}. Please try again.")

    print("Wake word not detected. Exiting.")
    return False
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
def load_and_chunk_pdf(pdf_file):
    """Extract the text of a PDF and split it into overlapping chunks.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        A list of text chunks of at most 250 characters, with 50 characters
        of overlap between neighbouring chunks.
    """
    from PyPDF2 import PdfReader

    print("Loading and chunking PDF...")
    reader = PdfReader(pdf_file)

    # Pages with no extractable text are skipped; every kept page is
    # terminated with a newline.
    page_texts = (page.extract_text() for page in reader.pages)
    all_text = "".join(f"{text}\n" for text in page_texts if text)

    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=250,  # Size of each chunk
        chunk_overlap=50,  # Overlap between chunks to maintain context
        separators=["\n\n", "\n", " ", ""],
    )
    return text_splitter.split_text(all_text)
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
def setup_chromadb(chunks):
    """Rebuild the ``knowledge_base`` ChromaDB collection from text chunks.

    Any existing ``knowledge_base`` collection in the persistent store at
    ``chroma_db`` is deleted first, then every chunk is stored together with
    its ``all-MiniLM-L6-v2`` embedding.

    Args:
        chunks: Iterable of text chunks to index.

    Returns:
        The newly created ChromaDB collection.
    """
    print("Setting up ChromaDB...")
    client = chromadb.PersistentClient(path="chroma_db")
    text_embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Delete existing collection (if needed). This is best-effort: a missing
    # collection is the normal first-run case, so the error is only reported.
    try:
        client.delete_collection(name="knowledge_base")
        print("Deleted existing collection: knowledge_base")
    except Exception as e:
        print(f"Collection does not exist or could not be deleted: {e}")

    collection = client.create_collection(name="knowledge_base")

    chunks = list(chunks)
    if chunks:
        # Encode all chunks in one batched call instead of one call per chunk.
        embeddings = text_embedding_model.encode(chunks)
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            collection.add(
                ids=[f"chunk_{i}"],
                embeddings=[embedding.tolist()],
                metadatas=[{"source": "pdf", "chunk_id": i}],
                documents=[chunk],
            )
    print("Text chunks and embeddings stored in ChromaDB.")
    return collection


# Additionally, we will prepare the function for retrieval with the text
# query to ChromaDB as well.
def query_chromadb(collection, query, top_k=3):
    """Query ChromaDB for relevant chunks.

    Args:
        collection: ChromaDB collection to search.
        query: Text query; it is embedded with ``all-MiniLM-L6-v2``.
        top_k: Number of results requested from the collection.

    Returns:
        A flat list of the matching document chunks.
    """
    text_embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = text_embedding_model.encode(query).tolist()
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
    )
    # results["documents"] holds one list of documents per query embedding;
    # flatten it into a single list.
    relevant_chunks = [chunk for sublist in results["documents"] for chunk in sublist]
    return relevant_chunks
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
- 33.
- 34.
- 35.
- 36.
然后,我们需要准备生成功能来完成RAG系统。在本例中,我将使用托管在HuggingFace中的Qwen1.5-0.5B-Chat模型。你可以根据需要调整提示和生成模型。
def generate_response(query, context_chunks):
    """Answer a question from retrieved context with Qwen1.5-0.5B-Chat.

    Args:
        query: The user's question.
        context_chunks: Retrieved text chunks; they are joined with newlines
            and placed in the prompt as context.

    Returns:
        The generated answer text, without the prompt and special tokens.
    """
    model_name = "Qwen/Qwen1.5-0.5B-Chat"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Format the prompt with the query and context
    context = "\n".join(context_chunks)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Use the following context to answer the question:\n\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Send the inputs to wherever device_map="auto" actually placed the
    # model, rather than to a separately guessed device.
    model_inputs = tokenizer(
        [text],
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(model.device)

    # Generate the response
    generated_ids = model.generate(
        model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=512,
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() returns prompt + continuation; drop the prompt tokens so
    # only the newly generated answer is decoded.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
- 33.
- 34.
- 35.
- 36.
- 37.
- 38.
- 39.
- 40.
- 41.
- 42.
- 43.
最后,令人兴奋的地方在于使用文本到语音模型将生成的响应转换成音频文件。就本例而言,我们将使用托管在HuggingFace中的Suno Bark模型。在生成音频之后,我们将播放音频响应以完成整条管道。
# Sample rate in Hz of the audio produced by the Bark text-to-speech model.
BARK_SAMPLE_RATE = 24000


def text_to_speech(text, output_file):
    """Synthesize speech for ``text`` with Bark and save it as a WAV file.

    Args:
        text: Text to speak.
        output_file: Path of the WAV file to write.

    Returns:
        The generated waveform as a NumPy float array.
    """
    from transformers import AutoProcessor, BarkModel

    print("Generating speech...")
    processor = AutoProcessor.from_pretrained("suno/bark-small")
    model = BarkModel.from_pretrained("suno/bark-small")
    inputs = processor(text, return_tensors="pt")
    audio_array = model.generate(**inputs)
    audio = audio_array.cpu().numpy().squeeze()

    # Save the audio to a file. Use the rate the model actually generates at;
    # writing Bark output with a 22050 Hz header makes it play back too slowly.
    sample_rate = model.generation_config.sample_rate
    # Clip before scaling so an out-of-range sample cannot wrap around when
    # it is cast to 16-bit PCM.
    pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    write(output_file, sample_rate, pcm)
    print(f"Audio response saved to {output_file}")
    return audio


def play_audio(audio, samplerate=BARK_SAMPLE_RATE):
    """Play a waveform on the default output device and wait until it ends.

    Args:
        audio: Waveform to play.
        samplerate: Playback rate in Hz; defaults to Bark's output rate.
    """
    print("Playing response...")
    sd.play(audio, samplerate=samplerate)
    sd.wait()
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
def main():
    """Run the voice-activated RAG pipeline once, end to end.

    The PDF is indexed, the wake word is awaited, one spoken question is
    recorded and transcribed, relevant chunks are retrieved, an answer is
    generated, and the answer is spoken back. The temporary recording is
    removed on every exit path.
    """
    # Step 1: Load and chunk the PDF
    chunks = load_and_chunk_pdf(PDF_FILE)

    # Step 2: Set up ChromaDB
    collection = setup_chromadb(chunks)

    try:
        # Step 3: Detect wake word with embedding similarity
        if not detect_wake_word():
            return  # Exit if wake word is not detected

        # Step 4: Record and transcribe user input
        record_audio(AUDIO_FILE, duration=5)
        user_input = transcribe_audio(AUDIO_FILE)
        print(f"User Input: {user_input}")

        # Step 5: Query ChromaDB for relevant chunks
        relevant_chunks = query_chromadb(collection, user_input)
        print(f"Relevant Chunks: {relevant_chunks}")

        # Step 6: Generate response using a Hugging Face model
        response = generate_response(user_input, relevant_chunks)
        print(f"Generated Response: {response}")

        # Step 7: Convert response to speech, save it, and play it
        audio = text_to_speech(response, RESPONSE_AUDIO_FILE)
        play_audio(audio)
    finally:
        # Clean up: delete the temporary audio file even when the wake word
        # was missed or a step failed. The file may not exist if recording
        # never succeeded, which is not an error here.
        try:
            os.remove(AUDIO_FILE)
        except FileNotFoundError:
            pass


if __name__ == "__main__":
    main()
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
- 33.
python app.py
原文标题:Creating a Useful Voice-Activated Fully Local RAG System,作者:Cornellius Yudha Wijaya |