Saturday, October 24, 2009

Write your own search engine using Lucene

Recently, I have been playing around with Lucene.Net (http://incubator.apache.org/lucene.net/). Lucene is an open source project, sponsored by the Apache Software Foundation, that gives you all the components necessary to create your own search engine.

I downloaded the latest build, which is versioned 2.0.004 and located at http://incubator.apache.org/lucene.net/download/Incubating-Apache-Lucene.Net-2.0-004-11Mar07.bin.zip.

To start off I wrote a small application that indexes all the INF files in my %WINDIR%\System32 directory, and allows me to search their contents.

Here is an example of how the application works:

Search...
You can enter....
filename|fullpath|lastmodified|contents
>> windows fullpath
Query: fullpath:windows
#hits: 8
----------------
Filename: homepage.inf
FullPath: c:\windows\system32\homepage.inf
Last-Modified: 8/10/2004 3:00:00 AM
----------------
Filename: ieuinit.inf
FullPath: c:\windows\system32\ieuinit.inf
Last-Modified: 6/29/2009 1:40:16 AM
----------------
Filename: mapisvc.inf
FullPath: c:\windows\system32\mapisvc.inf
Last-Modified: 4/14/2006 11:39:08 PM
----------------
Filename: mmdriver.inf
FullPath: c:\windows\system32\mmdriver.inf
Last-Modified: 8/10/2004 3:00:00 AM
----------------
Filename: msxmlx.inf
FullPath: c:\windows\system32\msxmlx.inf
Last-Modified: 8/6/2003 10:15:48 AM
----------------
Filename: pid.inf
FullPath: c:\windows\system32\pid.inf
Last-Modified: 6/20/2007 10:52:36 PM
----------------
Filename: $ncsp$.inf
FullPath: c:\windows\system32\$ncsp$.inf
Last-Modified: 9/23/2005 6:50:22 AM
----------------
Filename: $winnt$.inf
FullPath: c:\windows\system32\$winnt$.inf
Last-Modified: 9/27/2005 8:46:09 PM
Search...
You can enter....
filename|fullpath|lastmodified|contents
>> msxml contents
Query: contents:msxml
#hits: 1
----------------
Filename: msxmlx.inf
FullPath: c:\windows\system32\msxmlx.inf
Last-Modified: 8/6/2003 10:15:48 AM
Search...
You can enter....
filename|fullpath|lastmodified|contents
>>

Here is the code for the application.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Store;
using Lucene.Net.Util;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.QueryParsers;
using sdir = System.IO.Directory;
using LDirectory = Lucene.Net.Store.Directory;
using System.IO;

namespace searchtest
{
    /// <summary>
    /// Console demo: builds a Lucene index over the *.inf files in
    /// c:\windows\system32 and then answers interactive queries against it.
    /// </summary>
    class Program
    {
        /// <summary>Field searched when a command does not name one.</summary>
        private const string DefaultField = "contents";

        /// <summary>
        /// Entry point. Creates the on-disk index, indexes the INF files and
        /// runs the interactive search prompt until an empty line (or end of
        /// input) is read.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Analyzer analyzer = new StandardAnalyzer();

            // The second argument asks FSDirectory to create the index directory.
            // A RAMDirectory could be substituted here for a purely in-memory index.
            LDirectory directory = FSDirectory.GetDirectory("/index.bin", true);
            try
            {
                BuildIndex(directory, analyzer);
                RunSearchLoop(directory, analyzer);
            }
            finally
            {
                // Close the directory even when indexing or searching fails.
                directory.Close();
            }
        }

        /// <summary>
        /// Adds one document per *.inf file, with the fields
        /// filename, fullpath, lastmodified and contents.
        /// </summary>
        /// <param name="directory">Lucene directory that receives the index.</param>
        /// <param name="analyzer">Analyzer used to tokenize the field values.</param>
        private static void BuildIndex(LDirectory directory, Analyzer analyzer)
        {
            IndexWriter writer = new IndexWriter(directory, analyzer, true);
            try
            {
                // Raise the per-field indexing limit so long INF files are not cut short.
                writer.SetMaxFieldLength(25000);

                foreach (String inf in sdir.GetFiles(@"c:\windows\system32", "*.inf"))
                {
                    FileInfo fi = new FileInfo(inf);

                    String text;
                    using (StreamReader sr = new StreamReader(inf))
                    {
                        text = sr.ReadToEnd();
                    }

                    Document doc = new Document();
                    doc.Add(new Field("filename", fi.Name, Field.Store.YES, Field.Index.TOKENIZED));
                    doc.Add(new Field("fullpath", fi.FullName, Field.Store.YES, Field.Index.TOKENIZED));
                    doc.Add(new Field("lastmodified", DateField.DateToString(fi.LastWriteTimeUtc), Field.Store.YES, Field.Index.TOKENIZED));
                    doc.Add(new Field("contents", text, Field.Store.YES, Field.Index.TOKENIZED));
                    writer.AddDocument(doc);
                }
            }
            finally
            {
                // Close the writer on every path, including when a file cannot be read.
                writer.Close();
            }
        }

        /// <summary>
        /// Prompts for commands, runs each one as a Lucene query and prints
        /// the stored fields of every hit. Returns when an empty line or end
        /// of input is read.
        /// </summary>
        /// <param name="directory">Lucene directory holding the index to search.</param>
        /// <param name="analyzer">Analyzer handed to the query parser.</param>
        private static void RunSearchLoop(LDirectory directory, Analyzer analyzer)
        {
            IndexSearcher isearcher = new IndexSearcher(directory);
            try
            {
                while (true)
                {
                    Console.WriteLine("Search...");
                    Console.WriteLine("You can enter....");
                    Console.WriteLine("filename|fullpath|lastmodified|contents");

                    Console.Write(">> ");
                    String cmd = Console.ReadLine();

                    // ReadLine returns null at end of input; an empty line also quits.
                    if (String.IsNullOrEmpty(cmd))
                    {
                        break;
                    }

                    String predicate;
                    String fieldname;
                    if (!TryParseCommand(cmd, out predicate, out fieldname))
                    {
                        Console.WriteLine("ERROR:");
                        continue;
                    }

                    // fieldname is the default field for terms that carry no "field:" prefix.
                    QueryParser parser = new QueryParser(fieldname, analyzer);
                    Query query;
                    try
                    {
                        query = parser.Parse(predicate);
                    }
                    catch (Exception e)
                    {
                        // A malformed query is reported and the prompt continues.
                        Console.WriteLine(e);
                        continue;
                    }

                    Hits hits = isearcher.Search(query);
                    int hitCount = hits.Length();
                    Console.WriteLine("Query: {0}", query.ToString());
                    Console.WriteLine("#hits: {0}", hitCount);

                    for (int i = 0; i < hitCount; i++)
                    {
                        Console.WriteLine("----------------");
                        Document hitDoc = hits.Doc(i);
                        Console.WriteLine("Filename: {0}", hitDoc.Get("filename"));
                        Console.WriteLine("FullPath: {0}", hitDoc.Get("fullpath"));
                        Console.WriteLine("Last-Modified: {0}", DateField.StringToDate(hitDoc.Get("lastmodified")));
                    }
                }
            }
            finally
            {
                isearcher.Close();
            }
        }

        /// <summary>
        /// Splits a command line into the query text and the field to search.
        /// Accepted forms:
        ///   term                 - search the default field
        ///   term field           - search the named field
        ///   "some text" [field]  - quoted text, optional field after the closing quote
        ///   !"some text" [field] - as above; the leading "!" is dropped
        /// </summary>
        /// <param name="cmd">Non-empty command line as typed by the user.</param>
        /// <param name="predicate">Receives the query text, or null on failure.</param>
        /// <param name="fieldname">Receives the field name; the default field if none was given.</param>
        /// <returns>True if the command was understood; false if it is malformed.</returns>
        private static bool TryParseCommand(String cmd, out String predicate, out String fieldname)
        {
            predicate = null;
            fieldname = DefaultField;

            bool bang = cmd.StartsWith("!", StringComparison.Ordinal);
            if (bang || cmd.StartsWith("\"", StringComparison.Ordinal))
            {
                // The quoted text starts after the opening quote: index 1, or 2 after "!".
                int start = bang ? 2 : 1;
                int close = cmd.LastIndexOf('"');

                // No closing quote after the opening one: reject instead of
                // passing a negative length to Substring.
                if (close < start)
                {
                    return false;
                }

                predicate = cmd.Substring(start, close - start);

                // Trim so that '"a b" contents' names the field "contents", not " contents".
                String rest = cmd.Substring(close + 1).Trim();
                if (rest.Length > 0)
                {
                    fieldname = rest;
                }
            }
            else
            {
                // A null separator splits on whitespace; dropping empty entries
                // tolerates leading, trailing and repeated spaces.
                String[] tokens = cmd.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
                if (tokens.Length < 1 || tokens.Length > 2)
                {
                    return false;
                }

                predicate = tokens[0];
                if (tokens.Length == 2)
                {
                    fieldname = tokens[1];
                }
            }

            return predicate.Length > 0;
        }
    }
}

2 comments :

  1. Get the 2.31 or later from the CVS and build the lucene libraries and you will get a considerable speed increase and even greater flexibility.

    If you want the manual on lucene, go to http://www.manning.com/hatcher3

    ReplyDelete
  2. Thanks for the info. I will give that a try.

    ReplyDelete