Configuring and Consuming Fuzzy Search in .NET with Elasticsearch
Mohamed Tayel
Posted on August 20, 2024
Introduction
This comprehensive guide covers everything you need to know about integrating Elasticsearch with a .NET application to perform fuzzy searches on large datasets, including Arabic and English names. We’ll walk through configuring Elasticsearch, reading data from Excel, indexing it, and performing fuzzy searches. Whether you're dealing with large datasets or need to support multiple languages, this guide has you covered.
Step 1: Configuring Elasticsearch
Before diving into the code, let's start with the necessary Elasticsearch configurations to support fuzzy search, handle large datasets, and ensure accurate Arabic text processing.
1.1 Increasing Index Size
To accommodate large datasets and prevent errors related to `maxClauseCount` (the maximum number of clauses a single query may contain), we need to raise the default query-clause and result-window limits in Elasticsearch.
- Update `maxClauseCount` to Handle Large Queries:
PUT /_cluster/settings
{
"persistent": {
"indices.query.bool.max_clause_count": 10000
}
}
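This configuration change increases the limit for the number of boolean clauses allowed in a single query, ensuring that large batch queries don’t fail. Note that `indices.query.bool.max_clause_count` is documented as a static node-level setting: if the cluster settings API rejects the request above, set it in `elasticsearch.yml` on every node and restart the nodes instead. In Elasticsearch 8.x the setting is deprecated and has no effect, because the clause limit is computed automatically.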
- Optimize Index Settings:
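You can also adjust index settings for better performance. The index must already exist for this request to succeed, so create it first (see section 1.2) if you are starting from scratch: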
PUT /my_test_index/_settings
{
"index": {
"max_result_window": 50000
}
}
This increases the `max_result_window` from its default of 10,000 to 50,000, allowing more results to be fetched for large datasets.
1.2 Supporting Arabic Text with Custom Analyzer
To accurately search Arabic text using fuzzy logic, we need to configure a custom Arabic analyzer:
PUT /my_test_index
{
"settings": {
"analysis": {
"analyzer": {
"my_arabic_analyzer": {
"type": "arabic"
}
}
}
},
"mappings": {
"properties": {
"full_name_ar": {
"type": "text",
"analyzer": "my_arabic_analyzer"
}
}
}
}
This setup ensures that Arabic text is properly tokenized and processed, making the fuzzy search more accurate and reliable.
Step 2: Full Code Implementation
Now that Elasticsearch is configured, let's dive into the full code implementation. The code reads data from an Excel file, indexes it into Elasticsearch, and performs fuzzy searches for both Arabic and English names.
using System;
using System.Collections.Generic;
using System.Configuration;
using System.IO;
using System.Linq;
using System.Text;
using Elasticsearch.Net;
using Nest;
using Newtonsoft.Json;
using OfficeOpenXml;
/// <summary>
/// Document type stored in and read back from the Elasticsearch index.
/// Each instance holds one person's name in Arabic and in English.
/// </summary>
public class Person
{
/// <summary>
/// The Arabic full name. Stored under the "full_name_ar" field, which is the
/// field the index mapping shown earlier assigns to the custom Arabic analyzer.
/// </summary>
[Text(Name = "full_name_ar")]
public string FullName { get; set; }
/// <summary>
/// The English full name. Stored under the "full_name_en" field.
/// </summary>
[Text(Name = "full_name_en")]
public string FullNameEn { get; set; }
}
/// <summary>
/// One search hit in the shape that is serialized to JSON and printed by the program.
/// </summary>
public class SearchResult
{
/// <summary>Arabic full name copied from the matched document.</summary>
public string FullName { get; set; }
/// <summary>English full name copied from the matched document.</summary>
public string FullNameEn { get; set; }
/// <summary>Relevance score reported for the hit; 0 when the hit carries no score.</summary>
public double MatchingScore { get; set; }
/// <summary>True when the stored name is equal to the search text, false for a fuzzy-only match.</summary>
public bool ExactMatching { get; set; }
}
class Program
{
/// <summary>
/// Entry point: reads connection settings from App.config, verifies the target index,
/// loads names from the configured Excel file, indexes them, then runs one English and
/// one Arabic fuzzy search and prints both JSON results.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Set console to UTF-8 to handle Arabic characters
    Console.OutputEncoding = Encoding.UTF8;

    // Read from App.config
    string elasticUri = ConfigurationManager.AppSettings["ElasticsearchUri"];
    string username = ConfigurationManager.AppSettings["ElasticsearchUsername"];
    string password = ConfigurationManager.AppSettings["ElasticsearchPassword"];
    string excelFilePath = ConfigurationManager.AppSettings["excelFilePath"];

    // Fail with a readable message instead of an exception from new Uri(null)
    // or from the Excel loader when a required setting is missing or malformed.
    if (!Uri.TryCreate(elasticUri, UriKind.Absolute, out var nodeUri))
    {
        Console.WriteLine("App setting 'ElasticsearchUri' is missing or is not a valid absolute URI.");
        return;
    }
    if (string.IsNullOrWhiteSpace(excelFilePath))
    {
        Console.WriteLine("App setting 'excelFilePath' is missing.");
        return;
    }

    // Set up the Elasticsearch client.
    // SECURITY NOTE(review): CertificateValidations.AllowAll disables TLS certificate
    // validation. It is kept here to preserve the existing behaviour, but it should be
    // replaced with proper certificate validation outside local development.
    var settings = new ConnectionSettings(nodeUri)
        .DefaultIndex("my_test_index")
        .DisableDirectStreaming(true)
        .ServerCertificateValidationCallback(CertificateValidations.AllowAll);

    // Only send credentials when they are actually configured.
    if (!string.IsNullOrEmpty(username))
    {
        settings = settings.BasicAuthentication(username, password);
    }

    var client = new ElasticClient(settings);

    // Verify if the index exists
    var existsResponse = client.Indices.Exists("my_test_index");
    if (!existsResponse.Exists)
    {
        // Exists is also false when the call itself failed (connection refused, bad
        // credentials, ...), so report that case separately from a missing index.
        if (existsResponse.OriginalException != null)
        {
            Console.WriteLine($"Could not verify the index: {existsResponse.OriginalException.Message}");
        }
        else
        {
            Console.WriteLine("The specified index does not exist.");
        }
        return;
    }

    // Load names from Excel
    var names = LoadNamesFromExcel(excelFilePath);

    // Index the names into Elasticsearch
    IndexNames(client, names);

    // Name to search for using fuzzy search
    string searchNameAr = @"احمد محمد محمد";
    string searchName = @"John John ";

    // Perform a fuzzy search and generate the response JSON
    var jsonResponse = PerformFuzzySearchEnglish(client, names, searchName);
    var jsonResponse1 = PerformFuzzyArabic(client, names, searchNameAr);

    // Output the JSON response
    Console.WriteLine("-------------------------English search -----------------------------------");
    Console.WriteLine(jsonResponse);
    Console.WriteLine("------------------------------Arabic search------------------------------");
    Console.WriteLine(jsonResponse1);
}
/// <summary>
/// Runs a fuzzy search on the English name field and returns the hits as indented JSON.
/// Every whitespace-separated part of <paramref name="searchName"/> must fuzzily match
/// (edit distance 2) a term of the stored English name.
/// </summary>
/// <param name="client">Elasticsearch client; the search runs against its default index.</param>
/// <param name="names">
/// Not used by the query (the search runs against the index, not this list). Kept so
/// existing callers keep compiling.
/// </param>
/// <param name="searchName">The English name to look for. Leading/trailing spaces are ignored.</param>
/// <returns>
/// A JSON array of <see cref="SearchResult"/> objects; "[]" when the search text is blank
/// or the search fails.
/// </returns>
static string PerformFuzzySearchEnglish(ElasticClient client, List<(string FullNameAr, string FullNameEn)> names, string searchName)
{
    var allResults = new List<SearchResult>();

    // Trim first so that input such as "John John " can still be reported as an exact
    // match, and drop empty entries so doubled spaces do not produce empty search terms.
    var trimmedName = (searchName ?? string.Empty).Trim();
    var nameParts = trimmedName.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    if (nameParts.Length == 0)
    {
        // An empty bool query would be sent as "match everything"; return no results instead.
        return JsonConvert.SerializeObject(allResults, Formatting.Indented);
    }

    // One fuzzy clause per name part, all required. The previous version wrapped each
    // part in a "should" with one identical clause per loaded name and repeated the same
    // search once per batch of names, which only inflated the clause count and the scores
    // and appended the same hits multiple times.
    // NOTE(review): fuzzy is a term-level query, so the value is not analyzed (e.g. not
    // lower-cased). Confirm this is intended for the analyzer used on full_name_en.
    var searchResponse = client.Search<Person>(s => s
        .Query(q => q
            .Bool(b => b
                .Must(nameParts.Select(part => (Func<QueryContainerDescriptor<Person>, QueryContainer>)(mp => mp
                    .Fuzzy(fz => fz
                        .Field(p => p.FullNameEn)
                        .Value(part)
                        .Fuzziness(Fuzziness.EditDistance(2))
                    )
                )).ToArray())
            )
        )
    );

    if (!searchResponse.IsValid)
    {
        // ServerError is null for transport-level failures, so fall back to the exception text.
        var reason = searchResponse.ServerError?.Error?.Reason ?? searchResponse.OriginalException?.Message;
        Console.WriteLine($"Search failed: {reason}");
        return JsonConvert.SerializeObject(allResults, Formatting.Indented);
    }

    foreach (var hit in searchResponse.Hits)
    {
        allResults.Add(new SearchResult
        {
            FullName = hit.Source.FullName,
            FullNameEn = hit.Source.FullNameEn,
            MatchingScore = hit.Score ?? 0,
            ExactMatching = string.Equals(hit.Source.FullNameEn, trimmedName, StringComparison.Ordinal)
        });
    }

    return JsonConvert.SerializeObject(allResults, Formatting.Indented);
}
/// <summary>
/// Runs a fuzzy full-text search on the Arabic name field and returns the hits as
/// indented JSON. The search text is analyzed with "my_arabic_analyzer" and each
/// resulting term is matched with automatic fuzziness.
/// </summary>
/// <param name="client">Elasticsearch client used to run the search.</param>
/// <param name="names">Not used by the query. Kept so existing callers keep compiling.</param>
/// <param name="searchName">The Arabic name to look for.</param>
/// <returns>
/// A JSON array of <see cref="SearchResult"/> objects ("[]" when the search text is blank),
/// or an empty string when the search request fails.
/// </returns>
static string PerformFuzzyArabic(ElasticClient client, List<(string FullNameAr, string FullNameEn)> names, string searchName)
{
    if (string.IsNullOrWhiteSpace(searchName))
    {
        // A match query without text would be dropped and the request would match everything.
        return JsonConvert.SerializeObject(new List<SearchResult>(), Formatting.Indented);
    }

    var searchResponse = client.Search<Person>(s => s
        .Index("my_test_index")
        .Query(q => q
            .Match(m => m
                .Field(p => p.FullName) // Ensure this is mapped with the Arabic analyzer
                .Query(searchName)
                .Analyzer("my_arabic_analyzer")
                // Without this the match query is an ordinary (non-fuzzy) match, which
                // contradicts the method's purpose. Auto scales the allowed edits with
                // term length, which suits short name tokens better than a fixed distance.
                .Fuzziness(Fuzziness.Auto)
            )
        )
    );

    if (!searchResponse.IsValid)
    {
        // ServerError is null for transport-level failures, so fall back to the exception text.
        var reason = searchResponse.ServerError?.Error?.Reason ?? searchResponse.OriginalException?.Message;
        Console.WriteLine($"Search failed: {reason}");
        return string.Empty;
    }

    var results = searchResponse.Hits.Select(hit => new SearchResult
    {
        FullName = hit.Source.FullName,
        FullNameEn = hit.Source.FullNameEn,
        MatchingScore = hit.Score ?? 0,
        ExactMatching = hit.Source.FullName == searchName
    }).ToList();

    return JsonConvert.SerializeObject(results, Formatting.Indented);
}
/// <summary>
/// Reads (Arabic name, English name) pairs from the first worksheet of an Excel file.
/// Column 1 holds the Arabic name and column 2 the English name; row 1 is treated as a
/// header and skipped. Rows where either name is blank are ignored.
/// </summary>
/// <param name="filePath">Path to the .xlsx file.</param>
/// <returns>The name pairs found; an empty list when the workbook has no sheet or no data.</returns>
/// <exception cref="ArgumentException"><paramref name="filePath"/> is null or blank.</exception>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
static List<(string FullNameAr, string FullNameEn)> LoadNamesFromExcel(string filePath)
{
    if (string.IsNullOrWhiteSpace(filePath))
    {
        throw new ArgumentException("The Excel file path must not be empty.", nameof(filePath));
    }

    // ExcelPackage silently creates a new, empty workbook for a path that does not exist,
    // so check up front and report the real problem.
    var file = new FileInfo(filePath);
    if (!file.Exists)
    {
        throw new FileNotFoundException("The Excel file was not found.", filePath);
    }

    var names = new List<(string FullNameAr, string FullNameEn)>();
    ExcelPackage.LicenseContext = LicenseContext.NonCommercial;

    using (var package = new ExcelPackage(file))
    {
        // FirstOrDefault avoids depending on whether the worksheet collection is
        // configured as 0-based or 1-based.
        ExcelWorksheet worksheet = package.Workbook.Worksheets.FirstOrDefault();

        // Dimension is null when the sheet contains no cells at all.
        if (worksheet == null || worksheet.Dimension == null)
        {
            return names;
        }

        // End.Row is the index of the last used row. Dimension.Rows is only the height of
        // the used range and is too small whenever the data does not begin on row 1.
        int lastRow = worksheet.Dimension.End.Row;

        for (int row = 2; row <= lastRow; row++) // Start from row 2 to skip header
        {
            string fullNameAr = worksheet.Cells[row, 1].Text;
            string fullNameEn = worksheet.Cells[row, 2].Text;

            if (!string.IsNullOrWhiteSpace(fullNameAr) && !string.IsNullOrWhiteSpace(fullNameEn))
            {
                names.Add((fullNameAr, fullNameEn));
            }
        }
    }

    return names;
}
/// <summary>
/// Indexes every name pair as a <see cref="Person"/> document, one request per document,
/// into the client's default index. Failures are written to the console and do not stop
/// the remaining documents from being indexed.
/// </summary>
/// <param name="client">Elasticsearch client used to index the documents.</param>
/// <param name="names">The (Arabic name, English name) pairs to index.</param>
static void IndexNames(ElasticClient client, List<(string FullNameAr, string FullNameEn)> names)
{
    foreach (var name in names)
    {
        var person = new Person
        {
            FullName = name.FullNameAr,
            FullNameEn = name.FullNameEn
        };

        var indexResponse = client.IndexDocument(person);
        if (indexResponse.IsValid)
        {
            continue;
        }

        // ServerError is null when the request never produced a server error body
        // (for example a connection failure), so dereferencing it directly would throw
        // while trying to report the failure. Fall back to the exception text.
        var reason = indexResponse.ServerError?.Error?.Reason
                     ?? indexResponse.OriginalException?.Message
                     ?? "unknown error";
        Console.WriteLine($"Failed to index: {name.FullNameAr} - Reason: {reason}");
    }
}
/// <summary>
/// Splits a sequence into consecutive lists of at most <paramref name="batchSize"/> items,
/// preserving order. The last list may be shorter; an empty source yields no lists.
/// </summary>
/// <typeparam name="T">Element type.</typeparam>
/// <param name="source">The items to split.</param>
/// <param name="batchSize">Maximum number of items per list; must be greater than zero.</param>
/// <returns>A lazily evaluated sequence of batches.</returns>
/// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="batchSize"/> is zero or negative.</exception>
public static IEnumerable<List<T>> Batch<T>(IEnumerable<T> source, int batchSize)
{
    // Validate here, outside the iterator, so bad arguments fail at the call site
    // instead of on first enumeration. A batch size of 0 previously produced a single
    // batch containing every item, which defeats the purpose of batching.
    if (source == null)
    {
        throw new ArgumentNullException(nameof(source));
    }
    if (batchSize <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be greater than zero.");
    }

    return Iterate();

    IEnumerable<List<T>> Iterate()
    {
        var batch = new List<T>(batchSize);
        foreach (var item in source)
        {
            batch.Add(item);
            if (batch.Count == batchSize)
            {
                yield return batch;
                // Start a fresh list so batches already handed out are never mutated.
                batch = new List<T>(batchSize);
            }
        }

        // Emit the final, partially filled batch.
        if (batch.Count > 0)
        {
            yield return batch;
        }
    }
}
}
Conclusion
In this guide, we covered everything from configuring Elasticsearch to support large datasets and Arabic text, to reading data from an Excel file and indexing it into Elasticsearch, and finally performing fuzzy searches using .NET. The code provided is comprehensive and ready for immediate use, making it easier to implement fuzzy search in your own projects.
Posted on August 20, 2024
Join Our Newsletter. No Spam, Only the good stuff.
Sign up to receive the latest update from our blog.