Ensemble Retriever: BM25 (with morpheme tokenization) + FAISS, plus noun extraction
import numpy as np

from langchain_community.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document
from konlpy.tag import Okt
from kiwipiepy import Kiwi

kiwi = Kiwi()
# Tokenizer: split text into morpheme surface forms using Kiwi
def kiwi_tokenize(text):
    return [token.form for token in kiwi.tokenize(text)]
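# Quick sanity check (sketch): inspect how Kiwi splits a sample Korean sentence
# into morphemes before it is used for BM25 indexing.
print(kiwi_tokenize("금융보험은 장기적인 자산 관리에 유용합니다."))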
# `texts`, `split_documents`, and `embeddings` are assumed to be prepared in earlier steps
# (raw strings, split Document chunks, and an embedding model, respectively).
kiwi_bm25 = BM25Retriever.from_texts(texts, preprocess_func=kiwi_tokenize)
faiss = FAISS.from_documents(documents=split_documents, embedding=embeddings).as_retriever()
kiwi_bm25_faiss = EnsembleRetriever(
    retrievers=[kiwi_bm25, faiss],  # list of retrievers to combine
    weights=[0.5, 0.5],             # weight applied to each retriever's results
    search_type="mmr",              # use MMR to increase diversity of the results
)
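# Example usage (sketch): run the ensemble retriever on a user query.
# `question` is assumed to be a query string defined elsewhere; the ensemble
# merges the BM25 and FAISS rankings using the weights above.
ensemble_docs = kiwi_bm25_faiss.get_relevant_documents(question)
for doc in ensemble_docs:
    print(doc.page_content[:100])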
# Tokenizer function using Okt morphological analysis
okt = Okt()

def okt_tokenize(text):
    return okt.morphs(text)
def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)
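# Softmax turns raw BM25 scores into a probability-like distribution, so a fixed
# threshold can be compared across queries, e.g.
# softmax(np.array([1.0, 2.0, 3.0])) ≈ array([0.090, 0.245, 0.665])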
# Search with an Okt-based BM25 retriever
# (assumption: okt_bm25 is built the same way as kiwi_bm25, but with the Okt tokenizer)
okt_bm25 = BM25Retriever.from_texts(texts, preprocess_func=okt_tokenize)
bm25_results = okt_bm25.get_relevant_documents(question)
bm25_score = okt_bm25.vectorizer.get_scores(okt_tokenize(question))
normalized_scores = softmax(bm25_score)
max_score = max(normalized_scores)

# If even the best normalized score falls below the threshold, treat the query as unanswerable
SCORE_THRESHOLD = 0.2
if max_score <= SCORE_THRESHOLD:
    print("답변을 찾을 수 없습니다.")  # "No answer could be found."
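else:
    # Sketch (assumption, not in the original notes): otherwise fall back to the
    # ensemble retriever to collect the final context documents.
    final_docs = kiwi_bm25_faiss.get_relevant_documents(question)
    print(f"Retrieved {len(final_docs)} documents")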
# Noun extraction function
stop_nouns = []  # assumption: user-defined list of stopword nouns to exclude

def extract_nouns(text):
    tokens = kiwi.tokenize(text)
    # keep only nouns (tags starting with 'NN': NNG, NNP, ...)
    nouns = [token.form for token in tokens if token.tag.startswith('NN')]
    # drop stopword nouns
    filtered_nouns = [noun for noun in nouns if noun not in stop_nouns]
    return filtered_nouns
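# Sketch (assumption): the noun extractor can also serve as the BM25 preprocessor,
# so that only nouns are indexed and matched, and then be combined with FAISS
# in the same ensemble pattern as above.
noun_bm25 = BM25Retriever.from_texts(texts, preprocess_func=extract_nouns)
noun_bm25_faiss = EnsembleRetriever(retrievers=[noun_bm25, faiss], weights=[0.5, 0.5])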
Reference: https://github.com/teddylee777/langchain-kr/blob/main/10-Retriever/10-Kiwi-BM25Retriever.ipynb