Examples

This section provides practical examples of using YARP for different scenarios.

Document Search System

Building a simple document search system:

from yarp.vector_index import LocalMemoryIndex
from yarp.exceptions.runtime import EmbeddingProviderNotFoundException
import json

# Load documents from a JSON file shaped like:
# {"documents": [{"content": "..."}, ...]}
def load_documents(filename):
    with open(filename, 'r') as f:
        data = json.load(f)
    return [doc['content'] for doc in data['documents']]

# Create search system
class DocumentSearcher:
    def __init__(self, documents, model_name="all-MiniLM-L6-v2"):
        self.index = LocalMemoryIndex(documents, model_name)
        self.index.process()

    def search(self, query, max_results=5):
        results = self.index.query(query, top_k=max_results)
        return [(r.document, r.matching_score) for r in results]

    def add_document(self, document):
        self.index.add(document)

    def save(self, path):
        self.index.backup(path)

    @classmethod
    def load(cls, path):
        # Bypass __init__ so the documents are not re-embedded;
        # the restored index already contains them
        instance = cls.__new__(cls)
        instance.index = LocalMemoryIndex.load(path)
        return instance

# Usage
documents = load_documents('my_documents.json')
try:
    searcher = DocumentSearcher(documents)
    results = searcher.search("machine learning algorithms")
    for doc, score in results:
        print(f"[{score:.1f}] {doc[:100]}...")
except EmbeddingProviderNotFoundException as e:
    print(f"Missing dependency: {e}")

FAQ System

Creating an FAQ system that matches user questions to stored answers by semantic similarity:

from yarp.vector_index import LocalMemoryIndex

class FAQSystem:
    def __init__(self):
        # FAQ data: questions and their answers
        self.faqs = {
            "How do I install the software?": "Run 'pip install our-software' to install.",
            "What are the system requirements?": "Python 3.8+, 4GB RAM minimum.",
            "How do I reset my password?": "Click 'Forgot Password' on the login page.",
            "Where can I find the documentation?": "Visit our website's docs section.",
            "How do I contact support?": "Email us at support@example.com",
        }

        # Create index from questions
        questions = list(self.faqs.keys())
        self.index = LocalMemoryIndex(questions)
        self.index.process()

    def ask(self, user_question, threshold=30.0):
        results = self.index.query(user_question, top_k=3)

        best_matches = []
        for result in results:
            if result.matching_score >= threshold:
                question = result.document
                answer = self.faqs[question]
                confidence = result.matching_score
                best_matches.append((question, answer, confidence))

        return best_matches

    def add_faq(self, question, answer):
        self.faqs[question] = answer
        self.index.add(question)

# Usage
faq = FAQSystem()

user_query = "how to install this program?"
matches = faq.ask(user_query)

if matches:
    for question, answer, confidence in matches:
        print(f"Q: {question}")
        print(f"A: {answer}")
        print(f"Confidence: {confidence:.1f}%\n")
else:
    print("No matching FAQ found. Please contact support.")

Content Recommendation

Building a content recommendation system:

from yarp.vector_index import LocalMemoryIndex
from typing import List, Dict

class ContentRecommender:
    def __init__(self, content_items: List[Dict]):
        """
        content_items: List of dicts with 'id', 'title', 'description', 'tags'
        """
        self.items = {item['id']: item for item in content_items}

        # Create searchable text from title, description, and tags,
        # and map each text back to its item id (texts are assumed unique)
        self.searchable_texts = []
        self.text_to_id = {}

        for item in content_items:
            search_text = f"{item['title']} {item['description']} {' '.join(item['tags'])}"
            self.searchable_texts.append(search_text)
            self.text_to_id[search_text] = item['id']

        # Build index
        self.index = LocalMemoryIndex(self.searchable_texts)
        self.index.process()

    def recommend_by_interest(self, interests: str, count: int = 5):
        """Recommend content based on user interests"""
        results = self.index.query(interests, top_k=count)

        recommendations = []
        for result in results:
            # Map the matched text back to its item
            item_id = self.text_to_id[result.document]
            item = self.items[item_id]

            recommendations.append({
                'item': item,
                'relevance': result.matching_score
            })

        return recommendations

    def find_similar_content(self, item_id: str, count: int = 5):
        """Find content similar to a specific item"""
        if item_id not in self.items:
            return []

        item = self.items[item_id]
        query = f"{item['title']} {item['description']} {' '.join(item['tags'])}"

        # Fetch one extra result, since the item will match itself
        results = self.index.query(query, top_k=count + 1)

        similar_items = []
        for result in results:
            similar_id = self.text_to_id[result.document]

            # Skip the item itself
            if similar_id != item_id:
                similar_items.append({
                    'item': self.items[similar_id],
                    'similarity': result.matching_score
                })

        return similar_items[:count]

# Usage
content = [
    {
        'id': '1',
        'title': 'Introduction to Machine Learning',
        'description': 'Learn the basics of ML algorithms and applications',
        'tags': ['ml', 'python', 'beginner', 'tutorial']
    },
    {
        'id': '2',
        'title': 'Advanced Deep Learning',
        'description': 'Dive deep into neural networks and deep learning',
        'tags': ['deep-learning', 'neural-networks', 'advanced', 'ai']
    },
    {
        'id': '3',
        'title': 'Python for Data Science',
        'description': 'Using Python libraries for data analysis',
        'tags': ['python', 'data-science', 'pandas', 'numpy']
    }
]

recommender = ContentRecommender(content)

# Get recommendations based on interests
recommendations = recommender.recommend_by_interest("machine learning python")
for rec in recommendations:
    print(f"Title: {rec['item']['title']}")
    print(f"Relevance: {rec['relevance']:.1f}%")
    print(f"Description: {rec['item']['description']}\n")

Performance Optimization Example

Optimizing YARP for large datasets:

from yarp.vector_index import LocalMemoryIndex
import time

class OptimizedIndex:
    def __init__(self, documents, model_name="all-MiniLM-L6-v2"):
        self.index = LocalMemoryIndex(documents, model_name)

    def build_optimized(self, num_trees=512):
        """Build index with more trees for better accuracy"""
        start_time = time.perf_counter()
        self.index.process(num_trees=num_trees)
        build_time = time.perf_counter() - start_time
        print(f"Index built in {build_time:.2f} seconds with {num_trees} trees")

    def benchmark_search(self, queries, search_k_values=(50, 100, 200)):
        """Benchmark different search_k values"""
        results = {}

        for search_k in search_k_values:
            total_time = 0
            for query in queries:
                start_time = time.perf_counter()
                self.index.query(query, search_k=search_k, top_k=5)
                total_time += time.perf_counter() - start_time

            avg_time = total_time / len(queries)
            results[search_k] = avg_time
            print(f"search_k={search_k}: {avg_time*1000:.2f}ms per query")

        return results

    def memory_efficient_batch_add(self, documents, batch_size=1000):
        """Add documents in batches to manage memory usage"""
        for i in range(0, len(documents), batch_size):
            batch = documents[i:i + batch_size]
            self.index.add(batch)
            print(f"Added batch {i//batch_size + 1}, total docs: {len(self.index.documents)}")

# Usage for large datasets
large_documents = [f"Document {i} with content..." for i in range(10000)]

opt_index = OptimizedIndex(large_documents[:5000])  # Start with subset
opt_index.build_optimized(num_trees=256)

# Add remaining documents in batches
opt_index.memory_efficient_batch_add(large_documents[5000:], batch_size=500)

# Benchmark performance
test_queries = ["content search", "document retrieval", "information finder"]
timings = opt_index.benchmark_search(test_queries)
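The dictionary returned by benchmark_search can drive a simple tuning rule. A minimal sketch that picks the largest search_k whose average latency stays under a budget (budget_ms is a hypothetical tuning target, not a YARP parameter):

# Choose the largest search_k within the latency budget;
# fall back to the fastest setting if none qualifies
budget_ms = 10.0  # hypothetical tuning target
affordable = [k for k, t in timings.items() if t * 1000 <= budget_ms]
chosen_k = max(affordable) if affordable else min(timings, key=timings.get)
print(f"Chosen search_k: {chosen_k}")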