I had to throw together an implementation of tree-edit distance for clustering web-pages based on their structure. It performs reasonably quickly. The algorithm itself. The repo is here.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
(ns tree-edit-distance.core (:use [clojure.pprint]) (:import (org.htmlcleaner HtmlCleaner DomSerializer CleanerProperties) (org.w3c.dom Document))) (defn init "Perform the correct initialization" [m n c1 c2 del-cost ins-cost] (let [M (make-array Integer/TYPE (inc m) (inc n))] (do (doseq [i (range (inc m)) j (range (inc n))] (aset M i j (int (+ (* del-cost c1) (* ins-cost c2))))) M))) (defn num-children "Expects a html tree" [a-tree] (if(.hasChildNodes a-tree) (.getLength (.getChildNodes a-tree)) 0)) (defn tree-children "Return level 1 children" [a-tree] (let [n (num-children a-tree) cs (.getChildNodes a-tree)] (map #(.item cs %) (range n)))) (defn tree-descendants [a-tree] (if (.hasChildNodes a-tree) (concat (tree-children a-tree) (flatten (map tree-descendants (tree-children a-tree)))) [])) (declare rtdm-edit-distance) (defn invert-cost [t1 t2 del-cost ins-cost sub-cost] (let [t1-desc (tree-descendants t1) t2-desc (tree-descendants t2)] (- (+ (* del-cost (count t1-desc)) (* ins-cost (count t2-desc))) (rtdm-edit-distance t1 t2 del-cost ins-cost sub-cost)))) (defn rtdm-edit-distance "The RTDM algorithm for computing edit-distance. The trees are assumed to be org.w3c.dom.Documents" [tree-1 tree-2 del-cost ins-cost sub-cost] (let [m (num-children tree-1) n (num-children tree-2) t1-children (tree-children tree-1) t2-children (tree-children tree-2) t1-desc (tree-descendants tree-1) t2-desc (tree-descendants tree-2) M (init m n (count t1-desc) (count t2-desc) del-cost ins-cost)] (doseq [i (range m) j (range n)] (let [c-i (nth t1-children i) c-j (nth t2-children j) c-i-desc (tree-descendants c-i) c-j-desc (tree-descendants c-j) del (aget M i (inc j)) ins (aget M (inc i) j) sub-i (- (aget M i j) del-cost ins-cost) sub (if (.isEqualNode c-i c-j) (- sub-i (* ins-cost (count c-j-desc)) (* del-cost (count c-i-desc))) (cond (or (not (.hasChildNodes c-i)) (not (.hasChildNodes c-j))) (+ sub-i sub-cost) (and (= (.getNodeName c-i) (.getNodeName c-j)) (try (= (.getNodeValue (.getNamedItem (.getAttributes c-i) "id")) (.getNodeValue (.getNamedItem (.getAttributes c-j) "id"))) (catch Exception e true)) (try (= (.getNodeValue (.getNamedItem (.getAttributes c-i) "class")) (.getNodeValue (.getNamedItem (.getAttributes c-j) "class"))) (catch Exception e true))) (- sub-i (invert-cost c-i c-j del-cost ins-cost sub-cost)) :else (+ sub-i sub-cost)))] (aset M (inc i) (inc j) (int (min del ins sub))))) (aget M m n))) (defn rtdm-edit-distance-sim [tree-1 tree-2 del-cost ins-cost sub-cost] (let [t1-desc (tree-descendants tree-1) t2-desc (tree-descendants tree-2)] (- 1 (/ (rtdm-edit-distance tree-1 tree-2 del-cost ins-cost sub-cost) (+ (* (+ (count t1-desc) 1) del-cost) (* (+ (count t2-desc) 1) sub-cost)))))) (defn get-xml-tree-body "Downloads a webpage and converts it to an org.w3.dom.Document" [page-src] (let [cleaner (new HtmlCleaner) props (.getProperties cleaner) cleaner-props (new CleanerProperties) dom-serializer (new DomSerializer cleaner-props) tag-node (.clean cleaner page-src)] (.createDOM dom-serializer tag-node))) (defn rtdm-edit-distance-html [pg1 pg2 del-cost ins-cost sub-cost] (rtdm-edit-distance-sim (get-xml-tree-body pg1) (get-xml-tree-body pg2) del-cost ins-cost sub-cost)) |