Whistlepig is a lightweight real-time search engine written in ANSI C. (description and source) I heard about it when Don Metzler plugged it in an answer he wrote on quora. In this post, with very little code, I was able to build an index, query it and write a servlet that talks to the index using the FFI.
Whistlepig is implemented in less than 3000 lines of ANSI C. It comes with a very decent query language and returns documents (that match a query) sorted by their time-of-insertion (into the index).
For using the FFI, we need a shared library. I had to modify the Makefile slightly to make it compile on OS X (you can download this from my fork here). I was able to build it on Linux with ease though. You can build the .so
(.dylib
) file using the following command:
gcc --shared -o libwhistlepig.so entry.o file-indexer.o index.o lock.o query-parser.o query-parser.lex.o query-parser.tab.o query.o search.o segment.o snippeter.o stringmap.o stringpool.o termhash.o tokenizer.lex.o mmap-obj.o error.o |
On OS X, you will need to replace the .so
with .dylib
.
In order for the generated libwhistlepig.so
file to be picked up the Racket FFI, you will need to add the directory where it resides to the environment variable LD_LIBRARY_PATH
.
Next, we need to use the FFI and write racket functions that call the corresponding Whistlepig routines. This file is sufficient to wrap around all this routines we will need to use.
#lang racket/base | |
(require ffi/unsafe | |
ffi/unsafe/define) | |
(require racket/stream) | |
;;;; Test if we can build/search from whistlepig | |
;; reference the whistlepig shared object | |
(define-ffi-definer define-whistlepig (ffi-lib "libwhistlepig")) | |
;; stdio functions | |
(define-whistlepig fopen (_fun _string _string -> _pointer)) ;; fopen | |
(define-whistlepig fclose (_fun _pointer -> _void)) | |
;; whistlepig structs | |
(define-cstruct _mmap_obj ([fd _int] | |
[loaded_size _uint32] | |
[content _pointer])) | |
(define-cstruct _wp_index ([pathname_base _pointer] | |
[num_segments _uint16] | |
[sizeof_segments _uint16] | |
[docid_offsets _pointer] | |
[segments _pointer] | |
[open _uint8] | |
[indexinfo _mmap_obj])) | |
(define-cstruct _wp_entry ([khash_t _pointer] | |
[next_offset _long])) | |
(define-cstruct _wp_query ([type _uint8] | |
[field _pointer] | |
[word _pointer] | |
[num_children _uint16] | |
[children _pointer] | |
[next _pointer] | |
[last _pointer] | |
[segment_idx _uint16] | |
[search_data _pointer])) | |
(define-cstruct _wp_result ([res1 _uint64] | |
[res2 _uint64] | |
[res3 _uint64])) | |
(define-cstruct _wp_error ([type _byte] | |
[size _int] | |
[msg _pointer] | |
[srcs _pointer])) | |
;; whistlepig functions | |
(define-whistlepig wp_entry_new (_fun -> _wp_entry-pointer)) | |
(define-whistlepig wp_entry_add_file (_fun _wp_entry-pointer | |
_string _pointer -> _pointer)) | |
(define-whistlepig | |
wp_index_add_entry | |
(_fun _wp_index-pointer | |
_wp_entry-pointer | |
[r : (_ptr o _uint64)] -> [n : _pointer] -> (and (not n) r))) | |
(define-whistlepig wp_entry_free (_fun _wp_entry-pointer -> _pointer)) | |
(define-whistlepig wp_index_exists (_fun _string -> _int)) | |
(define-whistlepig | |
wp_index_create | |
(_fun [m : (_ptr o (_ptr o _wp_index))] | |
_string -> [n : _pointer] -> (if (not n) m n))) | |
(define-whistlepig | |
wp_index_load | |
(_fun [i : (_ptr o (_ptr o _wp_index))] | |
_string | |
-> (p : _pointer) | |
-> (and (not p) i))) | |
(define-whistlepig | |
wp_query_parse | |
(_fun _string | |
_string | |
[q : (_ptr o (_ptr o _wp_query))] | |
-> [e : _pointer] | |
-> (and (not e) q))) | |
(define-whistlepig | |
wp_query_to_s | |
(_fun [_ptr i _wp_query] _int _pointer -> _int)) | |
(define-whistlepig | |
wp_index_setup_query | |
(_fun (_ptr i _wp_index) | |
[p : (_ptr i _wp_query)] | |
-> [r : _pointer] | |
-> (and (not r) (cast p _pointer _wp_query-pointer)))) | |
(define-whistlepig | |
wp_index_run_query | |
(_fun (_ptr i _wp_index) | |
(_ptr i _wp_query) | |
_int | |
[n : (_ptr o _uint32)] | |
[r : _pointer] | |
-> [s : _pointer] | |
-> (and (not s) n))) | |
(define-whistlepig | |
wp_index_count_results | |
(_fun (_ptr i _wp_index) | |
(_ptr i _wp_query) | |
[n : (_ptr o _int)] | |
-> [r : _pointer] | |
-> (and (not r) n))) | |
(define *field-name* "body") | |
;; Export functions | |
(define (wp-entry-new) | |
(wp_entry_new)) | |
(define (wp-index-create index-name) | |
(wp_index_create index-name)) | |
(define (wp-entry-add-file entry field file-pointer) | |
(wp_entry_add_file entry field file-pointer)) | |
(define (wp-index-add-entry index entry) | |
(wp_index_add_entry index entry)) | |
(define (wp-entry-free entry) | |
(wp_entry_free entry)) | |
(define (wp-index-load index-path) | |
(wp_index_load index-path)) | |
(define (wp-query-parse query field) | |
(wp_query_parse query field)) | |
(define (wp-query-to-s query) | |
(let* ((size 1024) | |
(buffer (malloc 'atomic size)) | |
(to-ret '*)) | |
(begin | |
(memset buffer 0 size) | |
(wp_query_to_s query size buffer) | |
(set! to-ret (cast buffer _pointer _string))) | |
to-ret)) | |
(define (wp-index-setup-query index query) | |
(wp_index_setup_query index query)) | |
(define (wp-index-exists index-base-path) | |
(not (= 0 (wp_index_exists index-base-path)))) | |
(define (wp-index-run-query index query results-to-show) | |
(let* ((size (* results-to-show 8)) | |
(buffer (malloc 'atomic size)) ; allocate | |
(num-res '*)) | |
(begin | |
(memset buffer 0 size) | |
(set! num-res (wp_index_run_query index query results-to-show buffer))) | |
(list num-res (res-buf->arr buffer num-res _uint64)))) | |
(define (res-buf->arr res-buf num-res ptr-type) | |
(map (lambda (i) (ptr-ref res-buf ptr-type i)) | |
(stream->list (in-range 0 num-res)))) | |
(define (file-close f) | |
(fclose f)) | |
(provide (all-defined-out)) |
Let us now test our implementation. Whistlepig itself ships with two programs: add
and query
. add
adds a bunch of files specified on the command line to a new index. We can replicate the functionality in add.rkt
:
#lang racket | |
(require racket/file) | |
(require srfi/13) ; string search methods | |
(require srfi/14) ; charset library | |
(require "racket-whistlepig.rkt") | |
;; Create a new index in /tmp (so this is very much unix only) | |
;; and add all the racket source files in it | |
(define (add-files index-prefix files) | |
(define index (wp-index-create index-prefix)) | |
(map | |
(lambda (file-path) | |
(let ((entry (wp-entry-new)) | |
(f (fopen file-path "r"))) | |
(begin | |
(wp-entry-add-file entry "body" f) | |
(wp-index-add-entry index entry) | |
(wp-entry-free entry) | |
(fclose f)))) | |
files)) | |
(define (main) | |
(match-define | |
(list* index-prefix documents+) | |
(command-line | |
#:program "add" | |
#:args stuff | |
stuff)) | |
(begin | |
(add-files index-prefix documents+) | |
(map | |
(lambda (d) (printf "~s added\n" d)) | |
documents+)) | |
(printf "Done adding\n")) | |
(main) |
interactive.rkt
takes and index location and interactively runs queries against it and returns doc-ids. This is interactive.rkt
.
#lang racket | |
(require racket/cmdline) | |
(require "racket-whistlepig.rkt") | |
(define (test-index-load) | |
(wp-index-load "/tmp/index")) | |
(define (query index q num-res) | |
(define query (wp-query-parse q "body")) | |
(define query2 (wp-index-setup-query index query)) | |
(wp-index-run-query index query2 num-res)) | |
(define (main) | |
(define (inner index) | |
(printf "query: ") | |
(let ((q (read-line (current-input-port)))) | |
(begin | |
(map | |
(lambda (x) (printf "found doc ~a\n" x)) | |
(second (query index q 100))) | |
(inner index)))) | |
(let* ((index-prefix (command-line | |
#:program "query" | |
#:args (index-prefix) | |
index-prefix)) | |
(index (wp-index-load index-prefix))) | |
(inner index))) | |
(main) |
Now, the next step is obviously very straightforward. I wanted a quick way to get set up and running. We don’t have things like stored-fields in Lucene so, we need an external map to doc-ids. I am using a file that contains a list of s-expressions
that look like this:
(doc-id doc-path doc-title doc-link)
On my laptop (not accessible from the public web), the file looks like:
((1 "/Users/shriphani/Documents/blog/2013/07/13/new-blog/index.html" "New Blog" "http://blog.shriphani.com/2013/07/13/new-blog/") | |
(2 "/Users/shriphani/Documents/blog/2013/07/13/fast-dates-parser/index.html" "Fast Dates Parser" "http://blog.shriphani.com/2013/07/13/fast-dates-parser/") | |
(3 "/Users/shriphani/Documents/blog/2013/07/21/accessing-your-kindle-highlights/index.html" "Accessing Your Kindle Highlights" "http://blog.shriphani.com/2013/07/21/accessing-your-kindle-highlights/") | |
(4 "/Users/shriphani/Documents/blog/2013/07/21/pittsburgh-vintage-grand-prix-italian-cars/index.html" "Pittsburgh Vintage Grand Prix - Italian Cars" "http://blog.shriphani.com/2013/07/21/pittsburgh-vintage-grand-prix-italian-cars") | |
(5 "/Users/shriphani/Documents/blog/2013/07/22/the-percolator-paper/index.html" "The Percolator Paper" "http://blog.shriphani.com/2013/07/24/the-percolator-paper/") | |
(6 "/Users/shriphani/Documents/blog/2013/08/16/clueweb12-status-report/index.html" "Clueweb12++ Status Report" "http://blog.shriphani.com/2013/08/16/clueweb12-status-report/") | |
(7 "/Users/shriphani/Documents/blog/2013/08/27/racket-whistlepig-bindings/index.html" "Racket Whistlepig Bindings" "http://blog.shriphani.com/2013/08/27/racket-whistlepig-bindings/")) |
So, our search engine will start, load the documents and add them to the index in the order specified.
When a query comes along, we get the doc-ids from the engine and line them up with the document title (and thus we get the doc-link that can be rendered).
We accept queries using a URL of the form http://domain/search?q=query
. This tiny servlet accomplishes that:
#lang racket | |
(require racket/cmdline | |
rackjure | |
net/url | |
web-server/http | |
web-server/servlet | |
web-server/servlet-env) | |
(require "racket-whistlepig.rkt") | |
(define index '*) | |
(define docs-ids-info '*) | |
(define (start req) | |
(response/xexpr | |
`(html (head (title "Shriphani's Blog Search")) | |
(body | |
(p | |
(form ((action "") (method "get")) | |
(input ((type "text") (name "q"))) | |
(input ((type "submit") (style "display:none"))))) | |
(p | |
,@(map render-result (process-url (request-uri req)))))))) | |
(define (process-url url) | |
(let ((query-keyword | |
(filter | |
(lambda (q) (equal? (car q) 'q)) | |
(url-query url)))) | |
(if (null? query-keyword) | |
'() | |
(search-index (~> query-keyword | |
car | |
cdr))))) | |
(define (initialize doc-ids-info index-prefix) | |
(begin | |
(set! docs-ids-info (read (open-input-file doc-ids-info))) | |
(set! index (wp-index-create index-prefix)) | |
(map add-document docs-ids-info))) | |
(define (add-document doc-id-info) | |
(let ((f (fopen (second doc-id-info) "r")) | |
(entry (wp-entry-new))) | |
(begin (wp-entry-add-file entry "body" f) | |
(wp-index-add-entry index entry) | |
(wp-entry-free entry) | |
(fclose f)))) | |
(define (render-result result) | |
`(p (h2 (a ((href ,(second result))) ,(first result))))) | |
(define (search-index kw) | |
(begin | |
(define query (wp-query-parse kw "body")) | |
(define query2 (wp-index-setup-query index query)) | |
(map | |
(lambda (d) (let ((info (assv d docs-ids-info))) | |
(list (third info) (fourth info)))) | |
(second (wp-index-run-query index query2 10))))) | |
(define (main) | |
(command-line #:program "whistlepig-servlet" | |
#:args (doc-ids-info index-prefix) | |
(initialize doc-ids-info index-prefix)) | |
(serve/servlet start #:servlet-path "/search")) | |
(main) | |
You can try it out here : http://blog.shriphani.com/search?q=hello. It is very clunky and only indexes my blog’s post pages. . It won’t generate snippets (the postings list doesn’t store token positions).
To make this really work, I will need to integrate it with Frog and flesh the UI out. This was something I threw together quickly so I could build on it later. It ended up being a good excuse to continue using the FFI. The whistlepig bindings are quite sparse too and can use some work.
The full code is available in this Github repository.