from pptx import Presentation
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
def _markdown_cell(text):
    """Return *text* made safe for use inside a single Markdown table cell."""
    # Collapse every whitespace run (including line breaks inside the cell)
    # into one space: a Markdown table row must stay on a single line.
    flattened = " ".join(text.split())
    # A raw pipe would be read as a column separator, so escape it.
    return flattened.replace("|", "\\|")


def _table_to_markdown(table):
    """Render a python-pptx table as a Markdown table; "" if it has no rows."""
    rows = [[_markdown_cell(cell.text) for cell in row.cells] for row in table.rows]
    if not rows:
        return ""
    # The first row is used as the header row.
    header = "| " + " | ".join(rows[0]) + " |"
    separator = "| " + " | ".join(["---"] * len(rows[0])) + " |"
    body = ["| " + " | ".join(r) + " |" for r in rows[1:]]
    return "\n".join([header, separator] + body)


def pptx_to_faiss_markdown(pptx_path, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Build a FAISS L2 index with one Markdown document per slide.

    Each slide that has content becomes a Markdown document starting with a
    ``# Slide N`` heading, followed by the text of its shapes and its tables
    rendered as Markdown tables. The documents are embedded with a
    SentenceTransformer model and added to a ``faiss.IndexFlatL2``.

    Args:
        pptx_path: Path of the presentation, passed to ``Presentation``.
        embedding_model: Name or path given to ``SentenceTransformer``.

    Returns:
        A tuple ``(index, documents, model)``. ``documents`` is a list of
        ``{"slide": <1-based slide number>, "content": <markdown>}`` dicts in
        the same order as the vectors in ``index``; ``model`` is the loaded
        SentenceTransformer, to be reused for encoding queries.

    Raises:
        ValueError: If no slide yields any text or table content, so there
            is nothing to index.
    """
    prs = Presentation(pptx_path)
    documents = []
    for slide_idx, slide in enumerate(prs.slides, start=1):
        slide_content = []
        for shape in slide.shapes:
            # --- Tables rendered as Markdown ---
            if getattr(shape, "has_table", False):
                table_md = _table_to_markdown(shape.table)
                if table_md:
                    slide_content.append(table_md)
            # --- Plain text ---
            elif hasattr(shape, "text"):
                text = shape.text.strip()
                if text:
                    slide_content.append(text)
        # Slides without any content are skipped entirely.
        if slide_content:
            markdown_text = f"# Slide {slide_idx}\n\n" + "\n\n".join(slide_content)
            documents.append({
                "slide": slide_idx,
                "content": markdown_text
            })

    # Fail early with a clear message, before loading the embedding model:
    # with no documents there is no embedding matrix to read a dimension from.
    if not documents:
        raise ValueError(f"No text or table content found in {pptx_path!r}")

    # Embeddings
    model = SentenceTransformer(embedding_model)
    texts = [doc["content"] for doc in documents]
    # FAISS expects a C-contiguous float32 matrix of shape (n_documents, dim).
    embeddings = np.ascontiguousarray(model.encode(texts), dtype=np.float32)

    # FAISS index
    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)
    return index, documents, model
# ==================== USAGE ====================
if __name__ == "__main__":
    # Demo: index a presentation, then run one semantic query against it.
    index, documents, model = pptx_to_faiss_markdown("mon_fichier.pptx")
    print(f"Index construit avec {index.ntotal} documents")
    query = "chiffres de vente 2024"
    # Encoding a one-element list already gives a 2-D (1, dim) array, which is
    # the shape index.search expects; wrapping it in another list would make
    # it 3-D and the search would fail.
    q_emb = np.ascontiguousarray(model.encode([query]), dtype=np.float32)
    _distances, neighbours = index.search(q_emb, k=3)
    print("\nRésultats de recherche :")
    for idx in neighbours[0]:
        # FAISS pads the result with -1 when the index holds fewer than k
        # vectors; without this guard documents[-1] would print the last slide.
        if idx < 0:
            continue
        doc = documents[idx]
        print(doc["content"])
        print("------")