(* MetaOCaml-source to the paper:
Christoph A. Herrmann:
"Functional Meta-Programming in the Construction of Parallel Programs",
presented at the workshop
"Constructive Methods of Parallel Programming (CMPP'04)", Stirling, UK
Example: Karatsuba polynomial multiplication expressed
in terms of a special Divide-and-Conquer skeleton
Copyright: University of Passau, 2004
may freely be used for non-commercial purposes
USE IS AT YOUR OWN RISK: NO RESPONSIBILITY IS TAKEN FOR ANY DIRECT OR
INDIRECT DISADVANTAGE, DAMAGE ETC. TO ANYBODY OR ANYTHING INCURRED BY USE!
please report bugs to herrmann(AT)fmi.uni-passau.de
*)
(* ---------------------------------------------------------------------------*)
open Printf
open List
open Mpi (* this is a special library (ocamlmpi-1.0.1), available via
http://pauillac.inria.fr/~xleroy/software.html,
which is required IN ADDITION to a C/MPI installation;
combined installation of ocamlmpi and MetaOCaml requires
a considerable amount of manual interaction *)
let size = comm_size comm_world (* number of processes *)
let myrank = comm_rank comm_world (* current process identifier *)
(* ---------------------------------------------------------------------------*)
(* auxiliary functions for Karatsuba's polynomial multiplication *)
let al2 xs = Array.length xs / 2
let lowpart xs = let n = al2 xs in Array.sub xs 0 n
let highpart xs = let n = al2 xs in Array.sub xs n n
let mixedparts xs = let n = al2 xs in Array.init n (fun i -> xs.(i)+xs.(n+i))
(* sequential Karatsuba's polynomial multiplication,
"xs" and "ys" have to be arrays of length the same power of two *)
let rec karat_seq xs ys =
let n = Array.length xs in
if n<=16
then (* solution for small problem sizes *)
let zs = Array.make (2*n) 0 in
for i=0 to n-1 do
for j=0 to n-1 do
zs.(i+j) <- zs.(i+j) + xs.(i) * ys.(j)
done
done;
zs
else (* solution for large problem sizes *)
let low = karat_seq (lowpart xs) (lowpart ys)
and high = karat_seq (highpart xs) (highpart ys)
and mixed = karat_seq (mixedparts xs) (mixedparts ys) in
let zs = Array.append low high in
for i=0 to n-1 do
zs.(i+n/2) <- zs.(i+n/2) + mixed.(i) - low.(i)-high.(i)
done;
zs
(* customizing functions for Karatsuba's polynomial multiplication
to be used in combination with the divide-and-conquer skeleton "dc" *)
let karat_basic xys = (* input is the concatenation of both coefficient
arrays of the operands to multiply *)
karat_seq (lowpart xys) (highpart xys)
let karat_divide subproblem xys = (* input is the concatenation of both
coefficient arrays of the operands
to multiply *)
let xs = lowpart xys and ys = highpart xys in
let sel = match subproblem with 0->lowpart | 1->highpart | 2->mixedparts
in Array.append (sel xs) (sel ys)
let karat_combine xss = (* input is an array of length three, of
which each element is one subproblem
solution *)
let low = xss.(0) and high = xss.(1) and mix = xss.(2) in
let ys = Array.append low high
and n = Array.length low in
for i=0 to n-1 do
ys.(i+n/2) <- ys.(i+n/2) + mix.(i) - low.(i) - high.(i)
done;
ys
(* ---------------------------------------------------------------------------*)
(* auxiliary functions to convert integer values between the types "int"
and its subranges "tag" and "rank" used by MPI;
purpose: avoid run-time type errors due to type casts, we hope for
a simpler solution in future releases of MetaOCaml *)
let rec totag (i:int) =
if i<=0
then
if i=0
then 0
else
begin
printf "negative tag";
0
end
else
if i mod 2 = 0
then 2*((totag (i/2)):tag)
else 1+2*(totag (i/2))
let rec torank (i:int) =
if i<=0
then
if i=0
then 0
else
begin
printf "negative rank";
0
end
else
if i mod 2 = 0
then 2*((torank (i/2)):rank)
else 1+2*(torank (i/2))
let rec fromrank (i:rank) =
if i<=0
then
if i=0
then 0
else
begin
printf "negative rank";
0
end
else
if i mod 2 = 0
then 2*((fromrank (i/2)):int)
else 1+2*(fromrank (i/2))
(* ---------------------------------------------------------------------------*)
(* definition of the language for describing parallelism *)
type 'a exp =
Atom of 'a
| Seq of (int * (int -> 'a exp))
| Par of (int * (int array -> int -> 'a exp))
let cseq xs = Seq (length xs, nth xs)
(* cost model function *)
let rec wduSeqPar isSeq n f =
let w = ref 0 and d = ref 0 and u = ref 0 in
for i=0 to n-1 do
let (w1,d1,u1) = wdu (f i) in
w := !w + w1;
d := if isSeq then !d + d1 else max !d d1;
u := if isSeq then max !u u1 else !u + u1
done;
(!w,!d,!u)
and wdu case = match case with
Atom _ -> (1,1,1)
| Seq (n,f) -> wduSeqPar true n f
| Par (n,f) -> wduSeqPar false n (f (Array.make n 0))
let id x = x
(* partial evaluator: takes a parallel specification and generates
code; precisely: combines code from smaller parts applying
instantiation in "Par" and emission of pieces passed to "Atom"
where appropriate *)
let rec peval case relpart submasters = match case with
Atom addCode -> if myrank=submasters.(relpart)
then addCode
else id
| Seq (0,f) -> id
| Seq (n,f) when n>0 -> let pev s = peval s relpart submasters
in fun x -> pev (f (n-1)) (pev (Seq (n-1,f)) x)
| Par (0,f) -> id
| Par (n,f) when n>0 -> let partadrs = Array.make n 0
and base = ref submasters.(relpart)
and mypart = ref 0 in
for i=0 to n-1 do
let (_,_,u) = wdu (f partadrs i) in
partadrs.(i) <- !base;
if !base<=myrank && !base+u>myrank then mypart := i;
base := !base + u
done;
peval (f partadrs !mypart) !mypart partadrs
(* ---------------------------------------------------------------------------*)
(* implementation of the skeleton "dc" *)
let send x d t = Mpi.send x (torank d) (totag t) Mpi.comm_world
let receive d t = Mpi.receive (torank d) (totag t) Mpi.comm_world
let rec dc degree basic divide combine depth =
if depth=0
then Atom (fun x -> (.< let y = .~x in basic y >.))
else
let master partners =
cseq [ Atom (fun x -> .< let y = .~x in
for i=1 to degree-1 do
send y (partners.(i)) depth
done;
divide 0 y
>.);
dc degree basic divide combine (depth-1);
Atom (fun x -> .< let y = .~x in
let tmpdata = Array.make degree [||] in
tmpdata.(0) <- y;
for i=1 to degree-1 do
tmpdata.(i) <- receive (partners.(i)) depth
done;
combine tmpdata
>.)
]
and worker mypart partner =
cseq [ Atom (fun x -> .< begin
.~x;
let y = receive partner depth in
divide mypart y
end
>.);
dc degree basic divide combine (depth-1);
Atom (fun x -> .< let y = .~x in
send y partner depth;
y
>.)
]
in Par (degree,
fun partners mypart -> if mypart=0
then master partners
else worker mypart partners.(0))
(* ---------------------------------------------------------------------------*)
(* a listing of all program designs to be used with the "dc" skeleton *)
let design name psize = match name with
| "karat" -> dc 3 karat_basic
karat_divide
karat_combine
psize
(* auxiliary power function *)
let rec pow k n = if n=0 then 1 else k * pow k (n-1)
(* main program, to be used with the following modes:
(a) calculation of required number of processes,
to be run on a single process,
args: karat procs
(b) complete cost information,
to be run on a single process,
args: karat info
(c) test of sequential execution times,
to be run on a single process,
args: karat seqk
(d) parallel execution, to be run on as many
processes as (a) tells
args: karat run
*)
let main =
let name = Sys.argv.(1) in
let psize = Scanf.sscanf (Sys.argv.(2)) "%d" (fun i -> i) in
let mode = Sys.argv.(3)
in
match mode with
"procs" ->
let (w,d,u) = wdu (design name psize) in
printf "%d\n" u
| "info" ->
let (w,d,u) = wdu (design name psize) in
printf "design %s: work=%d, depth=%d, processes=%d\n" name w d u
| "seqk" ->
let base=15 in
let q = Array.make 1 [||] in
for i=0 to 5 do
let indata = Array.init (pow 2 (i+base+1)) (fun i -> 1) in
let startTime = wtime () in
q.(0) <- karat_basic indata;
printf "i=%d, time=%f\n" (i+base) (wtime()-.startTime)
done
| "run" ->
let code _ = peval (design name psize) 0 [| 0 |] in
let runs = Scanf.sscanf (Sys.argv.(4)) "%d" (fun i -> i) in
let runtimes = Array.make runs 0.0 in
let pet = ref 0.0 in
let base = 15 in
let q = Array.make 1 [||] in
Mpi.barrier Mpi.comm_world;
for i=0 to runs-1 do
let indata = Array.init (pow 2 (i+base+1)) (fun i -> 1) in
let startTime = wtime () in
let fct = code () .< indata >. in
pet := wtime () -. startTime;
let startTime = wtime () in
q.(0) <- .!fct;
Mpi.barrier Mpi.comm_world;
runtimes.(i) <- wtime () -. startTime
done;
printf "pid %d finished \n" (fromrank myrank);
flush stdout;
Mpi.barrier Mpi.comm_world;
if myrank=0
then begin
printf "\n\npevaltimes\n"
end;
for i=0 to size-1 do
Mpi.barrier Mpi.comm_world;
if myrank=i
then
begin
printf "proc %d: %f\n" i !pet;
flush stdout
end;
Mpi.barrier Mpi.comm_world;
done;
Mpi.barrier Mpi.comm_world;
if myrank=0
then
begin
printf "runtimes\n";
for i=0 to runs-1 do
printf "run %d: %f\n" (i+base) runtimes.(i)
done;
end