|
|
@@ -0,0 +1,437 @@ |
|
|
module C : sig |
|
|
type t |
|
|
val empty : t |
|
|
val one : char -> t |
|
|
val union : t -> t -> t |
|
|
val inter : t -> t -> t |
|
|
val top : t |
|
|
val mem : char -> t -> bool |
|
|
val make : (char -> bool) -> t |
|
|
val equal : t -> t -> bool |
|
|
val negate : t -> t |
|
|
val is_empty : t -> bool |
|
|
val disjoint : t -> t -> bool |
|
|
val fold : (char -> 'a -> 'a) -> 'a -> t -> 'a |
|
|
val of_string : string -> t |
|
|
end = struct |
|
|
type t = bytes |
|
|
|
|
|
let make f = |
|
|
let open Int in |
|
|
let open Char in |
|
|
Bytes.init 32 (fun i -> |
|
|
let b0 = shift_left (Bool.to_int (f (chr (i * 8 + 0)))) 0 in |
|
|
let b1 = shift_left (Bool.to_int (f (chr (i * 8 + 1)))) 1 in |
|
|
let b2 = shift_left (Bool.to_int (f (chr (i * 8 + 2)))) 2 in |
|
|
let b3 = shift_left (Bool.to_int (f (chr (i * 8 + 3)))) 3 in |
|
|
let b4 = shift_left (Bool.to_int (f (chr (i * 8 + 4)))) 4 in |
|
|
let b5 = shift_left (Bool.to_int (f (chr (i * 8 + 5)))) 5 in |
|
|
let b6 = shift_left (Bool.to_int (f (chr (i * 8 + 6)))) 6 in |
|
|
let b7 = shift_left (Bool.to_int (f (chr (i * 8 + 7)))) 7 in |
|
|
let (||) = logor in |
|
|
Char.chr (b7 || b6 || b5 || b4 || b3 || b2 || b1 || b0)) |
|
|
|
|
|
let mem c s = |
|
|
let b = (Char.code c) / 8 in |
|
|
let i = (Char.code c) mod 8 in |
|
|
let w = Char.code (Bytes.get s b) in |
|
|
Int.logand (Int.shift_left 1 i) w > 0 |
|
|
|
|
|
let empty = make (fun c -> false) |
|
|
let top = make (fun c -> true) |
|
|
let one c = make (fun c' -> c = c') |
|
|
let union s1 s2 = make (fun c -> mem c s1 || mem c s2) |
|
|
let inter s1 s2 = make (fun c -> mem c s1 && mem c s2) |
|
|
let negate s = make (fun c -> not (mem c s)) |
|
|
|
|
|
let equal s1 s2 = |
|
|
let rec loop i acc = |
|
|
if i = 32 then |
|
|
acc |
|
|
else |
|
|
loop (i+1) (acc && (Bytes.get s1 i = Bytes.get s2 i)) |
|
|
in |
|
|
loop 0 true |
|
|
|
|
|
let is_empty s = equal s empty |
|
|
|
|
|
let disjoint s1 s2 = is_empty (inter s1 s2) |
|
|
|
|
|
let fold f init s = |
|
|
let rec loop i acc = |
|
|
if i > 255 then |
|
|
acc |
|
|
else |
|
|
let c = Char.chr i in |
|
|
if mem c s then |
|
|
loop (i+1) (f c acc) |
|
|
else |
|
|
loop (i+1) acc |
|
|
in |
|
|
loop 0 init |
|
|
|
|
|
let of_string str = |
|
|
let p c = String.contains str c in |
|
|
make p |
|
|
end |
|
|
|
|
|
module Tp : sig |
|
|
type t = { null : bool; first : C.t; follow : C.t } |
|
|
exception TypeError of string |
|
|
val char : char -> t |
|
|
val eps : t |
|
|
val seq : t -> t -> t |
|
|
val charset : C.t -> t |
|
|
val string : string -> t |
|
|
val alt : t -> t -> t |
|
|
val bot : t |
|
|
val equal : t -> t -> bool |
|
|
val fix : (t -> t) -> t |
|
|
val print : Format.formatter -> t -> unit |
|
|
end = struct |
|
|
type t = { |
|
|
null : bool; |
|
|
first : C.t; |
|
|
follow : C.t; |
|
|
} |
|
|
|
|
|
exception TypeError of string |
|
|
|
|
|
let char c = { |
|
|
null = false; |
|
|
first = C.one c; |
|
|
follow = C.empty; |
|
|
} |
|
|
|
|
|
let eps = { |
|
|
null = true; |
|
|
first = C.empty; |
|
|
follow = C.empty; |
|
|
} |
|
|
|
|
|
let seq t1 t2 = |
|
|
let separate t1 t2 = |
|
|
not t1.null |
|
|
&& |
|
|
C.disjoint t1.follow t2.first |
|
|
in |
|
|
if separate t1 t2 then |
|
|
{ null = false; |
|
|
first = t1.first; |
|
|
follow = C.union t2.follow (if t2.null then t1.follow else C.empty); |
|
|
} |
|
|
else |
|
|
raise (TypeError "ambiguous sequencing") |
|
|
|
|
|
let string s = |
|
|
if String.length s = 0 then |
|
|
eps |
|
|
else |
|
|
char s.[0] |
|
|
|
|
|
let alt t1 t2 = |
|
|
let nonoverlapping t1 t2 = |
|
|
not (t1.null && t2.null) |
|
|
&& |
|
|
C.disjoint t1.first t2.first |
|
|
in |
|
|
if nonoverlapping t1 t2 then |
|
|
{ |
|
|
null = t1.null || t2.null; |
|
|
first = C.union t1.first t2.first; |
|
|
follow = C.union t1.follow t2.follow; |
|
|
} |
|
|
else |
|
|
raise (TypeError "ambiguous alternation") |
|
|
|
|
|
let bot = { |
|
|
null = false; |
|
|
first = C.empty; |
|
|
follow = C.empty; |
|
|
} |
|
|
|
|
|
let charset cs = |
|
|
if C.is_empty cs then |
|
|
bot |
|
|
else |
|
|
{ null = false; |
|
|
first = cs; |
|
|
follow = C.empty; |
|
|
} |
|
|
|
|
|
let equal t1 t2 = |
|
|
t1.null = t2.null |
|
|
&& C.equal t1.first t2.first |
|
|
&& C.equal t1.follow t2.follow |
|
|
|
|
|
let fix f = |
|
|
let rec loop t = |
|
|
let t' = f t in |
|
|
if equal t t' then |
|
|
t' |
|
|
else |
|
|
loop t' |
|
|
in |
|
|
loop bot |
|
|
|
|
|
let print out t = |
|
|
let p fmt = Format.fprintf out fmt in |
|
|
let print_set cs = |
|
|
C.fold (fun c () -> p "%c" c) () cs |
|
|
in |
|
|
let print_bool = function |
|
|
| true -> p "true" |
|
|
| false -> p "false" |
|
|
in |
|
|
p "{\n"; |
|
|
p " null = "; print_bool t.null; p ";\n"; |
|
|
p " first = C.of_string \""; print_set t.first; p "\";\n"; |
|
|
p " follow = C.of_string \""; print_set t.follow; p "\";\n"; |
|
|
p "}\n" |
|
|
end |
|
|
|
|
|
module Parser: sig |
|
|
type 'a t |
|
|
exception ParseFailure of int |
|
|
|
|
|
val char : char -> unit t |
|
|
val charset : C.t -> char t |
|
|
val string : string -> unit t |
|
|
|
|
|
val map : ('a -> 'b) -> 'a t -> 'b t |
|
|
val (let+) : 'a t -> ('a -> 'b) -> 'b t |
|
|
|
|
|
val seq : 'a t -> 'b t -> ('a * 'b) t |
|
|
val (and+) : 'a t -> 'b t -> ('a * 'b) t |
|
|
|
|
|
val eps : unit t |
|
|
val return : 'a -> 'a t |
|
|
val fail : 'a t |
|
|
val any : 'a t list -> 'a t |
|
|
val fix : ('a t -> 'a t) -> 'a t |
|
|
|
|
|
val parse : 'a t -> string -> int -> (int * 'a) |
|
|
end = struct |
|
|
type 'a t = { tp : Tp.t; parse : string -> int -> int * 'a } |
|
|
exception ParseFailure of int |
|
|
|
|
|
let char c = |
|
|
let p s i = |
|
|
if i < String.length s && s.[i] = c then |
|
|
(i+1, ()) |
|
|
else |
|
|
raise (ParseFailure i) |
|
|
in |
|
|
{ tp = Tp.char c; parse = p } |
|
|
|
|
|
let (let+) p f = |
|
|
let p' s i = |
|
|
let (i, v) = p.parse s i in |
|
|
(i, f v) |
|
|
in |
|
|
{tp = p.tp; parse = p'} |
|
|
|
|
|
let map f p = let+ x = p in f x |
|
|
|
|
|
let (and+) p1 p2 = |
|
|
let p' s i = |
|
|
let (i, a) = p1.parse s i in |
|
|
let (i, b) = p2.parse s i in |
|
|
(i, (a,b)) |
|
|
in |
|
|
{ tp = Tp.seq p1.tp p2.tp; parse = p' } |
|
|
|
|
|
let seq = (and+) |
|
|
|
|
|
let eps = { tp = Tp.eps; parse = fun s i -> (i, ()) } |
|
|
|
|
|
let return x = |
|
|
let+ () = eps in x |
|
|
|
|
|
let charset cs = |
|
|
let p s i = |
|
|
if i < String.length s && C.mem s.[i] cs then |
|
|
(i+1, s.[i]) |
|
|
else |
|
|
raise (ParseFailure i) |
|
|
in |
|
|
{tp = Tp.charset cs; parse = p } |
|
|
|
|
|
let string str = |
|
|
let p s i = |
|
|
if i + String.length str < String.length s then |
|
|
let rec loop j = |
|
|
if j < String.length str then |
|
|
if str.[j] = s.[i + j] then |
|
|
loop (j+1) |
|
|
else |
|
|
raise (ParseFailure (i+j)) |
|
|
else |
|
|
(i+j, ()) |
|
|
in |
|
|
loop 0 |
|
|
else |
|
|
raise (ParseFailure i) |
|
|
in |
|
|
{tp = Tp.string str; parse = p} |
|
|
|
|
|
let fail = |
|
|
{ tp = Tp.bot; |
|
|
parse = fun s i -> raise (ParseFailure i) } |
|
|
|
|
|
let (||) p1 p2 = |
|
|
let p' s i = |
|
|
if i < String.length s then |
|
|
if C.mem s.[i] p1.tp.Tp.first then |
|
|
p1.parse s i |
|
|
else if C.mem s.[i] p2.tp.Tp.first then |
|
|
p2.parse s i |
|
|
else if p1.tp.Tp.null then |
|
|
p1.parse s i |
|
|
else if p2.tp.Tp.null then |
|
|
p2.parse s i |
|
|
else |
|
|
raise (ParseFailure i) |
|
|
else if p1.tp.Tp.null then |
|
|
p1.parse s i |
|
|
else if p2.tp.Tp.null then |
|
|
p2.parse s i |
|
|
else |
|
|
raise (ParseFailure i) |
|
|
in |
|
|
{tp = Tp.alt p1.tp p2.tp; parse = p' } |
|
|
|
|
|
let any ps = List.fold_left (||) fail ps |
|
|
|
|
|
|
|
|
let fix f = |
|
|
let g t = (f {fail with tp = t}).tp in |
|
|
let r = ref (fail.parse) in |
|
|
let p = f {tp = (Tp.fix g); parse = fun s i -> !r s i} in |
|
|
r := p.parse; |
|
|
p |
|
|
|
|
|
let parse p = p.parse |
|
|
end |
|
|
|
|
|
module Sexp = struct |
|
|
open Parser |
|
|
|
|
|
let letter = charset (C.of_string "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") |
|
|
|
|
|
let digit = charset (C.of_string "0123456789") |
|
|
let whitespace = charset (C.of_string " \t\n") |
|
|
|
|
|
let (==>) p f = let+ x = p in f x |
|
|
|
|
|
let (>>) p1 p2 = |
|
|
let+ x = p1 |
|
|
and+ _ = p2 in |
|
|
x |
|
|
|
|
|
let star p = |
|
|
fix (fun r -> |
|
|
any [ eps ==> (fun () -> []); |
|
|
seq p r ==> (fun (x, xs) -> x :: xs) |
|
|
]) |
|
|
|
|
|
let starskip p = |
|
|
fix (fun r -> |
|
|
any [ eps ==> (fun _ -> ()); |
|
|
seq p r ==> (fun _ -> ()) |
|
|
]) |
|
|
|
|
|
|
|
|
let symbol = |
|
|
let+ c = letter |
|
|
and+ cs = star letter |
|
|
and+ _ = starskip whitespace |
|
|
in |
|
|
let b = Buffer.create 0 in |
|
|
List.iter (Buffer.add_char b) (c :: cs); |
|
|
Buffer.contents b |
|
|
|
|
|
type sexp = |
|
|
| Sym of string |
|
|
| Seq of sexp list |
|
|
|
|
|
|
|
|
let rec generate_list g (fuel : int) = |
|
|
if (fuel = 0) then |
|
|
[] |
|
|
else if fuel = 1 then |
|
|
[ g fuel ] |
|
|
else |
|
|
let i = Random.int fuel in (* Divide the fuel *) |
|
|
let x = g i in |
|
|
let xs = generate_list g (fuel - i) in |
|
|
x :: xs |
|
|
|
|
|
let paren p = |
|
|
let+ () = char '(' >> starskip whitespace |
|
|
and+ x = p |
|
|
and+ () = char ')' >> starskip whitespace |
|
|
in |
|
|
x |
|
|
|
|
|
let sexp = |
|
|
fix (fun r -> |
|
|
any [ symbol ==> (fun s -> Sym s); |
|
|
paren (star r) ==> (fun xs -> Seq xs) |
|
|
]) |
|
|
|
|
|
end |
|
|
|
|
|
module Test = struct |
|
|
open Sexp |
|
|
|
|
|
(* This module randomly generates some huge s-expressions, and |
|
|
then tries to parse them *) |
|
|
|
|
|
|
|
|
let generate_symbol fuel = |
|
|
Char.(escaped (chr (65 + Random.int 26))) |
|
|
|
|
|
let rec generate_sexp fuel = |
|
|
if fuel = 0 then |
|
|
Seq [] |
|
|
else if fuel = 1 then |
|
|
Sym (generate_symbol fuel) |
|
|
else |
|
|
Seq (generate_list generate_sexp fuel) |
|
|
|
|
|
let rec print_sexp out = function |
|
|
| Sym s -> Format.fprintf out "%s" s |
|
|
| Seq xs -> Format.fprintf out "(%a)" print_sexps xs |
|
|
and print_sexps out = function |
|
|
| [] -> () |
|
|
| [s] -> print_sexp out s |
|
|
| x :: xs -> Format.fprintf out "%a %a" print_sexp x print_sexps xs |
|
|
|
|
|
let string_of_sexp sexp = |
|
|
let b = Buffer.create 0 in |
|
|
let out = Format.formatter_of_buffer b in |
|
|
let () = print_sexp out sexp in |
|
|
Buffer.contents b |
|
|
|
|
|
|
|
|
let time f x = |
|
|
let t = Sys.time () in |
|
|
let _ = f x in |
|
|
(Sys.time () -. t) |
|
|
|
|
|
let s1000k = string_of_sexp (generate_sexp 1000000) |
|
|
let s10M = string_of_sexp (generate_sexp 10000000) |
|
|
|
|
|
let test str = |
|
|
let len = String.length str in |
|
|
let t = time (fun s -> Parser.parse sexp s 0) str in |
|
|
let rate = (float_of_int len /. t) in begin |
|
|
Printf.printf "String length: %d bytes\n" len; |
|
|
Printf.printf "Parser elapsed time: %.3f sec\n" t; |
|
|
Printf.printf "Parsing rate: %.3g bytes/sec\n\n" rate; |
|
|
end |
|
|
|
|
|
let _ = test s1000k |
|
|
let _ = test s10M |
|
|
end |