(* MetaOCaml-source to the paper: Christoph A. Herrmann: "Generating Message-Passing Programs from Abstract Specifications by Partial Evaluation" submitted to Parallel Processing Letters Example: Karatsuba's multiplication of multidigit numbers expressed in terms of a special Divide-and-Conquer skeleton Copyright: Chair for Programming, University of Passau, 2004 may freely be used for non-commercial purposes USE IS AT YOUR OWN RISK: NO RESPONSIBILITY IS TAKEN FOR ANY DIRECT OR INDIRECT DISADVANTAGE, DAMAGE ETC. TO ANYBODY OR ANYTHING INCURRED BY USE! please report bugs to herrmann(AT)fmi.uni-passau.de *) (* ---------------------------------------------------------------------------*) open Printf open List open Mpi (* this is a special library (ocamlmpi-1.0.1), available via http://pauillac.inria.fr/~xleroy/software.html, which is required IN ADDITION to a C/MPI installation; combined installation of ocamlmpi and MetaOCaml requires a considerable amount of manual interaction *) let size = comm_size comm_world (* number of processes *) let myrank = comm_rank comm_world (* current process identifier *) (* ---------------------------------------------------------------------------*) (* auxiliary functions for Karatsuba's number multiplication *) let buffers = Array.make 5 [||] let rec pow k n = if n=0 then 1 else k * pow k (n-1) let radix = pow 10 9 let radixl = Int64.of_int radix let zerol = Int64.zero let addl = Int64.add let subl = Int64.sub let mull = Int64.mul let divl = Int64.div let of_int = Int64.of_int let to_int = Int64.to_int let lowpartCY xs = let n = Array.length xs in Array.sub xs 0 ((n+1)/2) let highpartCY xs = let n = Array.length xs in let h = (n+1)/2 in if n mod 2 = 0 then Array.sub xs h (n/2) else Array.init h (fun i -> if i=h-1 then 0 else xs.(i+h)) let mixedpartsCY xs = let ys = lowpartCY xs and zs = highpartCY xs in let n = Array.length ys in let arr = Array.make (n+1) 0 in let cy = ref zerol in for i=0 to n-1 do let sum = addl (of_int ys.(i)) (addl (of_int zs.(i)) !cy) in cy := divl sum radixl; arr.(i) <- to_int (subl sum (mull !cy radixl)); done; arr.(n) <- to_int !cy; arr (* sequential Karatsuba's number multiplication *) let seq_combine (xs,low,high,mixed) = let mo = Array.length (lowpartCY xs) in let zs = Array.make (2*mo + Array.length high) 0 in for i=0 to mo-1 do zs.(i) <- low.(i) done; let cy = ref zerol in for i=mo to 2*mo-1 do let sum = addl (addl !cy (of_int (low.(i)))) (addl (of_int mixed.(i-mo)) (addl (of_int (radix-low.(i-mo))) (of_int (radix-high.(i-mo))))) in let quo = divl sum radixl in zs.(i) <- to_int (subl sum (mull quo radixl)); cy := subl quo (of_int 2) done; for i=2*mo to Array.length low -1 do let sum = addl (addl !cy (addl (of_int (low.(i))) (of_int (high.(i-2*mo))))) (addl (of_int mixed.(i-mo)) (addl (of_int (radix-low.(i-mo))) (of_int (radix-high.(i-mo))))) in let quo = divl sum radixl in zs.(i) <- to_int (subl sum (mull quo radixl)); cy := subl quo (of_int 2) done; for i=Array.length low to mo+Array.length low -1 do let sum = addl (addl !cy (of_int (high.(i-2*mo)))) (addl (of_int mixed.(i-mo)) (addl (of_int (radix-low.(i-mo) )) (of_int (radix-high.(i-mo) )))) in let quo = divl sum radixl in zs.(i) <- to_int (subl sum (mull quo radixl)); cy := subl quo (of_int 2) done; for i=mo+Array.length low to mo+Array.length mixed -1 do let sum = addl (addl !cy (of_int (high.(i-2*mo)))) (addl (of_int mixed.(i-mo)) (addl radixl radixl)) in let quo = divl sum radixl in zs.(i) <- to_int (subl sum (mull quo radixl)); cy := subl quo (of_int 2) done; for i=mo+Array.length mixed -1 to 2*mo+Array.length high -1 do let sum = addl (addl !cy (of_int high.(i-2*mo))) (addl radixl radixl) in let quo = divl sum radixl in zs.(i) <- to_int (subl sum (mull quo radixl)); cy := subl quo (of_int 2) done; zs let karat_combine (xys,xss) = seq_combine(lowpartCY xys,xss.(0),xss.(1),xss.(2)) let rec karat_seq xs ys = let n = Array.length xs and m = Array.length ys in if min m n <= 32 then (* solution for small problem sizes *) let zs = Array.make (n+m) 0 in let cy = ref zerol in for i=0 to n-1 do cy := zerol; for j=0 to m-1 do let sum = addl (addl !cy (of_int zs.(i+j))) (mull (of_int xs.(i)) (of_int ys.(j))) in cy := divl sum radixl; zs.(i+j) <- to_int (subl sum (mull !cy radixl)) done; zs.(i+m) <- to_int !cy done; zs else (* solution for large problem sizes *) let low = karat_seq (lowpartCY xs) (lowpartCY ys) and high = karat_seq (highpartCY xs) (highpartCY ys) and mixed = karat_seq (mixedpartsCY xs) (mixedpartsCY ys) in seq_combine (xs,low,high,mixed) let karat_basic xys = (* input is the concatenation of both coefficient arrays of the operands to multiply *) karat_seq (lowpartCY xys) (highpartCY xys) let karat_divide subproblem xys = (* input is the concatenation of both coefficient arrays of the operands to multiply *) let xs = lowpartCY xys and ys = highpartCY xys in let sel = match subproblem with 0->lowpartCY | 1->highpartCY | 2->mixedpartsCY in let res = Array.append (sel xs) (sel ys) in res (* ---------------------------------------------------------------------------*) (* auxiliary functions to convert integer values between the types "int" and its subranges "tag" and "rank" used by MPI; purpose: avoid run-time type errors due to type casts, we hope for a simpler solution in future releases of MetaOCaml *) let rec totag (i:int) = if i<=0 then if i=0 then 0 else begin printf "negative tag"; 0 end else if i mod 2 = 0 then 2*((totag (i/2)):tag) else 1+2*(totag (i/2)) let rec torank (i:int) = if i<=0 then if i=0 then 0 else begin printf "negative rank"; 0 end else if i mod 2 = 0 then 2*((torank (i/2)):rank) else 1+2*(torank (i/2)) let rec fromrank (i:rank) = if i<=0 then if i=0 then 0 else begin printf "negative rank"; 0 end else if i mod 2 = 0 then 2*((fromrank (i/2)):int) else 1+2*(fromrank (i/2)) (* ---------------------------------------------------------------------------*) (* definition of the language for describing parallelism *) type commrec = { source: int; sindex: int; dest: int; dindex: int; ctag: int; } let rec fromto a b = if a>b then [] else a :: fromto (a+1) b type ('a,'b,'c) exp = Atom of 'a | Comm of (int * (int -> commrec) * (('b, 'c array) code)) | Seq of (int * (int -> ('a,'b,'c) exp)) | Par of (int * (int -> ('a,'b,'c) exp)) let cseq xs = Seq (length xs, nth xs) (* cost model function *) let rec wduSeqPar isSeq n f = let w = ref 0 and d = ref 0 and u = ref 0 in for i=0 to n-1 do let (w1,d1,u1) = wdu (f i) in w := !w + w1; d := if isSeq then !d + d1 else max !d d1; u := if isSeq then max !u u1 else !u + u1 done; (!w,!d,!u) and wdu case = match case with Atom _ -> (1,1,1) | Comm (n,_,_) -> (1,1,1) | Seq (n,f) -> wduSeqPar true n f | Par (n,f) -> wduSeqPar false n f let id x = x (* partial evaluator: takes a parallel specification and generates code; precisely: combines code from smaller parts applying instantiation in "Par" and emission of pieces passed to "Atom" where appropriate *) let send buffers index dest tag acc = .< begin .~acc; Mpi.send (.~buffers).(index) (torank dest) (totag tag) Mpi.comm_world; end >. let receive buffers index source tag acc = .< begin .~acc; (.~buffers).(index) <- Mpi.receive (torank source) (totag tag) Mpi.comm_world; end >. let rec interpret case relpart submasters = match case with Atom addCode -> if myrank=submasters.(relpart) then addCode else id | Comm (n,ci,buffers) -> fun preCode -> if myrank!=submasters.(relpart) then preCode else let step acc i = let c = ci i in if c.source = relpart then send buffers (c.sindex) (submasters.(c.dest)) (c.ctag) acc else if c.dest = relpart then receive buffers (c.dindex) (submasters.(c.source)) (c.ctag) acc else acc in let newCode = fold_left step .<()>. (fromto 0 (n-1)) in .< begin let y = .~preCode in .~newCode ; y end >. | Seq (0,f) -> id | Seq (n,f) when n>0 -> let interp s = interpret s relpart submasters in fun x -> interp (f (n-1)) (interp (Seq (n-1,f)) x) | Par (0,f) -> id | Par (n,f) when n>0 -> let partadrs = Array.make n 0 and base = ref submasters.(relpart) and mypart = ref 0 in for i=0 to n-1 do let (_,_,u) = wdu (f i) in partadrs.(i) <- !base; if !base<=myrank && !base+u>myrank then mypart := i; base := !base + u done; interpret (f !mypart) !mypart partadrs (* ---------------------------------------------------------------------------*) (* implementation of the skeleton "dc" *) let rec dc degree basic divide combine depth = if depth=0 then Atom (fun x -> (.< let (orig,y) = .~x in (orig,basic y) >.)) else Par (degree, fun mypart -> cseq [ Atom (fun x -> .< begin let (orig,y) = .~x in buffers.(depth) <- y; (orig,y) end >.); Comm (degree-1, (fun i -> {source=0; sindex=depth; dest=i+1; dindex=0; ctag=depth }), ..); Atom (fun x -> .< let q = .~x in let (orig,y) = if mypart=0 then q else ([],buffers.(0)) in (y::orig, divide mypart y) >.); dc degree basic divide combine (depth-1); Atom (fun x -> .< let q = .~x in buffers.(0) <- snd q; q >.); Comm (degree-1, (fun i -> {source=i+1; sindex=0; dest=0; dindex=i+1; ctag=depth }), ..); Atom (fun x -> .< let q = .~x in if mypart>0 then q else let (inp::orig,y) = q in let res = combine (inp,buffers) in (orig,res) >.) ]) (* ---------------------------------------------------------------------------*) (* a listing of all program designs to be used with the "dc" skeleton *) let design name psize = match name with | "karat" -> dc 3 karat_basic karat_divide karat_combine psize (* main program, to be used with the following modes: (a) calculation of required number of processes, to be run on a single process, args: karat procs (b) complete cost information, to be run on a single process, args: karat info (c) test of sequential execution times, to be run on a single process, args: karat seqk (d) parallel execution, to be run on as many processes as (a) tells args: karat run *) let main = let name = Sys.argv.(1) in let psize = Scanf.sscanf (Sys.argv.(2)) "%d" (fun i -> i) in let mode = Sys.argv.(3) in match mode with "procs" -> let (w,d,u) = wdu (design name psize) in printf "%d\n" u | "info" -> let (w,d,u) = wdu (design name psize) in printf "design %s: work=%d, depth=%d, processes=%d\n" name w d u | "seqk" -> let base=8 in let q = Array.make 1 [||] in printf "starting seqk\n"; flush stdout; for i=0 to 0 do let indata = Array.init (pow 2 (i+base+1)) (fun i -> 9) in let startTime = wtime () in q.(0) <- karat_basic indata; for i=Array.length q.(0) -1 downto 0 do printf "%d " q.(0).(i) done; printf "\ni=%d, time=%f\n" (i+base) (wtime()-.startTime) done | "run" -> let code _ = interpret (design name psize) 0 [| 0 |] in let runs = Scanf.sscanf (Sys.argv.(4)) "%d" (fun i -> i) in let runtimes = Array.make runs 0.0 in let pet = ref 0.0 in let base = 10 in let q = Array.make 1 [||] in for i=0 to runs-1 do let x = Array.init (pow 2 (i+base+1)) (fun i -> radix-1) in let indata = ([],x) in let startTime = wtime () in let fct = code () .< indata >. in pet := wtime () -. startTime; let startTime = wtime () in let (_,res) = .!fct in q.(0) <- res; runtimes.(i) <- wtime () -. startTime; if myrank=0 then begin printf "run %d: %f\n" (i+base) runtimes.(i); flush stdout end; Mpi.barrier Mpi.comm_world done; if myrank=0 then printf "\n partial evaluation times\n"; for i=0 to size-1 do if myrank=i then begin printf "proc %d: %f\n" i !pet; flush stdout end; done; if myrank=0 then begin printf "runtimes\n"; for i=0 to runs-1 do printf "run %d: %f\n" (i+base) runtimes.(i) done; end else ()