Parallel Matrix Multiplication


http://www.lst.inf.ethz.ch/teaching/lectures/ss10/24/slides/recitation/week04/mm.pdf
Parallel matrix multiplication
Each thread computes its share of the output C
Partition C by columns
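
The sketch below is my own minimal example of the column-partitioned idea (class and method names are made up, not from the slides): each worker thread owns a contiguous range of columns of C, so every element of C is written by exactly one thread and the only synchronization needed is the final join().

import java.util.ArrayList;
import java.util.List;

public class ColumnPartitionMultiply {

    // Multiply a (n x k) by b (k x m) into c (n x m), splitting the columns of C
    // across numThreads worker threads.
    public static void multiply(final double[][] a, final double[][] b, final double[][] c,
            int numThreads) throws InterruptedException {
        final int n = c.length;
        final int m = c[0].length;
        final int k = a[0].length;
        List<Thread> workers = new ArrayList<Thread>();
        for (int t = 0; t < numThreads; t++) {
            final int startCol = t * m / numThreads;
            final int endCol = (t + 1) * m / numThreads;   // this thread owns columns [startCol, endCol)
            Thread worker = new Thread(new Runnable() {
                public void run() {
                    for (int j = startCol; j < endCol; j++) {
                        for (int i = 0; i < n; i++) {
                            double sum = 0.0;
                            for (int x = 0; x < k; x++) {
                                sum += a[i][x] * b[x][j];
                            }
                            c[i][j] = sum;   // each column of C is written by exactly one thread
                        }
                    }
                }
            });
            workers.add(worker);
            worker.start();
        }
        for (Thread worker : workers) {
            worker.join();   // the only synchronization: wait for all column blocks
        }
    }

    public static void main(String[] args) throws InterruptedException {
        int n = 512;
        double[][] a = new double[n][n], b = new double[n][n], c = new double[n][n];
        for (int i = 0; i < n; i++)
            for (int j = 0; j < n; j++) { a[i][j] = 1.0; b[i][j] = 1.0; }
        multiply(a, b, c, Runtime.getRuntime().availableProcessors());
        System.out.println(c[0][0]);   // expect n = 512.0
    }
}

The StackOverflow version below takes a different route and partitions the work recursively by quadrants instead of by columns.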

http://stackoverflow.com/questions/5484204/parallel-matrix-multiplication-in-java-6
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;

// Algorithm (initialize/execute/printResult) is an external benchmark interface
// not shown in this snippet.
public class Java6MatrixMultiply implements Algorithm {

    private static final int SIZE = 2048;
    private static final int THRESHOLD = 64;
    private static final int MAX_THREADS = Runtime.getRuntime().availableProcessors();

    private final ExecutorService executor = Executors.newFixedThreadPool(MAX_THREADS);

    private float[][] a = new float[SIZE][SIZE];
    private float[][] b = new float[SIZE][SIZE];
    private float[][] c = new float[SIZE][SIZE];

    @Override
    public void initialize() {
        init(a, b, SIZE);
    }

    @Override
    public void execute() {
        MatrixMultiplyTask mainTask =  new MatrixMultiplyTask(a, 0, 0, b, 0, 0, c, 0, 0, SIZE);
        Future<?> future = executor.submit(mainTask);

        try {
            future.get();
        } catch (Exception e) {
            System.out.println("Error: " + e.getMessage());
        }
    }

    @Override
    public void printResult() {
        check(c, SIZE);

        for (int i = 0; i < SIZE && i <= 10; i++) {
            for (int j = 0; j < SIZE && j <= 10; j++) {         
                if(j == 10) {
                    System.out.print("...");
                }
                else {
                    System.out.print(c[i][j] + " ");
                }
            }

            if(i == 10) {
                System.out.println();
                for(int k = 0; k < 10; k++) System.out.print(" ... ");
            }   

            System.out.println();
        }       

        System.out.println();
    }

    // To simplify checking, fill with all 1's. Answer should be all n's.
    static void init(float[][] a, float[][] b, int n) {
        for (int i = 0; i < n; ++i) {
            for (int j = 0; j < n; ++j) {
                a[i][j] = 1.0F;
                b[i][j] = 1.0F;
            }
        }
    }

    static void check(float[][] c, int n) {
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                if (c[i][j] != n) {
                    throw new Error("Check Failed at [" + i + "][" + j + "]: " + c[i][j]);
                    //System.out.println("Check Failed at [" + i + "][" + j + "]: " + c[i][j]); 
                }
            }
        }       
    }   

    public class Seq implements Runnable {

        private final MatrixMultiplyTask a;
        private final MatrixMultiplyTask b;

        public Seq(MatrixMultiplyTask a, MatrixMultiplyTask b) {
            this.a = a;
            this.b = b;     
        }

        public void run() {
            a.run();
            b.run();
        }   
    }

    private class MatrixMultiplyTask implements Runnable {
        private final float[][] A; // Matrix A
        private final int aRow; // first row of current quadrant of A
        private final int aCol; // first column of current quadrant of A

        private final float[][] B; // Similarly for B
        private final int bRow;
        private final int bCol;

        private final float[][] C; // Similarly for result matrix C
        private final int cRow;
        private final int cCol;

        private final int size;

        public MatrixMultiplyTask(float[][] A, int aRow, int aCol, float[][] B,
                int bRow, int bCol, float[][] C, int cRow, int cCol, int size) {

            this.A = A;
            this.aRow = aRow;
            this.aCol = aCol;
            this.B = B;
            this.bRow = bRow;
            this.bCol = bCol;
            this.C = C;
            this.cRow = cRow;
            this.cCol = cCol;
            this.size = size;
        }   

        public void run() {

            //System.out.println("Thread: " + Thread.currentThread().getName());

            if (size <= THRESHOLD) {
                multiplyStride2();
            } else {

                int h = size / 2;

                Seq seq1 = new Seq(
                        new MatrixMultiplyTask(A, aRow, aCol,         // A11
                                B, bRow, bCol,                        // B11
                                C, cRow, cCol,                        // C11
                                h),
                        new MatrixMultiplyTask(A, aRow, aCol + h,     // A12
                                B, bRow + h, bCol,                    // B21
                                C, cRow, cCol,                        // C11
                                h));

                Seq seq2 = new Seq(
                        new MatrixMultiplyTask(A, aRow, aCol,         // A11
                                B, bRow, bCol + h,                    // B12
                                C, cRow, cCol + h,                    // C12
                                h),
                        new MatrixMultiplyTask(A, aRow, aCol + h,     // A12
                                B, bRow + h, bCol + h,                // B22
                                C, cRow, cCol + h,                    // C12
                                h));

                Seq seq3 = new Seq(
                        new MatrixMultiplyTask(A, aRow + h, aCol,     // A21
                                B, bRow, bCol,                        // B11
                                C, cRow + h, cCol,                    // C21
                                h),
                        new MatrixMultiplyTask(A, aRow + h, aCol + h, // A22
                                B, bRow + h, bCol,                    // B21
                                C, cRow + h, cCol,                    // C21
                                h));

                Seq seq4 = new Seq(
                        new MatrixMultiplyTask(A, aRow + h, aCol,     // A21
                                B, bRow, bCol + h,                    // B12
                                C, cRow + h, cCol + h,                // C22
                                h),
                        new MatrixMultiplyTask(A, aRow + h, aCol + h, // A22
                                B, bRow + h, bCol + h,                // B22
                                C, cRow + h, cCol + h,                // C22
                                h));

                final FutureTask<Void> s1Task = new FutureTask<Void>(seq2, null);
                final FutureTask<Void> s2Task = new FutureTask<Void>(seq3, null);
                final FutureTask<Void> s3Task = new FutureTask<Void>(seq4, null);

                executor.execute(s1Task);
                executor.execute(s2Task);
                executor.execute(s3Task);

                // Run the first quadrant pair in the current thread, then try to
                // "help" by calling run() on the submitted FutureTasks: run() is a
                // no-op if a pool thread has already started that task.
                seq1.run();
                s1Task.run();
                s2Task.run();
                s3Task.run();

                try {
                    s1Task.get();
                    s2Task.get();
                    s3Task.get();
                } catch (Exception e) {
                    System.out.println("Error: " + e.getMessage());
                    executor.shutdownNow();
                }       
            }       
        }       

        public void multiplyStride2() {
            for (int j = 0; j < size; j += 2) {
                for (int i = 0; i < size; i += 2) {

                    float[] a0 = A[aRow + i];
                    float[] a1 = A[aRow + i + 1];

                    float s00 = 0.0F;
                    float s01 = 0.0F;
                    float s10 = 0.0F;
                    float s11 = 0.0F;

                    for (int k = 0; k < size; k += 2) {

                        float[] b0 = B[bRow + k];

                        s00 += a0[aCol + k] * b0[bCol + j];
                        s10 += a1[aCol + k] * b0[bCol + j];
                        s01 += a0[aCol + k] * b0[bCol + j + 1];
                        s11 += a1[aCol + k] * b0[bCol + j + 1];

                        float[] b1 = B[bRow + k + 1];

                        s00 += a0[aCol + k + 1] * b1[bCol + j];
                        s10 += a1[aCol + k + 1] * b1[bCol + j];
                        s01 += a0[aCol + k + 1] * b1[bCol + j + 1];
                        s11 += a1[aCol + k + 1] * b1[bCol + j + 1];
                    }

                    C[cRow + i][cCol + j] += s00;
                    C[cRow + i][cCol + j + 1] += s01;
                    C[cRow + i + 1][cCol + j] += s10;
                    C[cRow + i + 1][cCol + j + 1] += s11;
                }
            }           
        }
    }
}
http://www.cs.indiana.edu/classes/b673-bram/notes/matrix_mult.html

1D Block Layout Algorithm

First consider layout by block columns, with the matrix partitioning given with s = p above. Process k owns all the data in A(:,k), B(:,k), and C(:,k).
Let m = myrank, the MPI identifier of my process number. The algorithm relies on a "block daxpy" form of matrix-matrix multiply
C(:,i) = C(:,i) + A*B(:,i)
           = C(:,i) + sum A(:,j)*B(j,i),
where the sum runs from 0 to p-1. A picture of this for block column i = 1 is below:

[Figure: block column i = 1 of C = C + A*B in the 1D block-column layout]
Let's assume the "owner computes" rule: the processor that ends up "owning" the data is the one that will carry out the computations (and concomitant communications) needed to get that data. Then process i has all of C and B that it needs, but it will need to access all of A. The basic idea is to first let each process multiply the part it owns:
C(:,m) = C(:,m) + A(:,m)*B(m,m)
where m = myrank, and then to send its block column of A to the "next" process, in a shift with wrap-around. The corresponding parallel algorithm is
C(:,m) = C(:,m) + A(:,m)*B(m,m)
Copy A(:,m) to buffer S
for i = 1: p-1
  send S to process (m+1) mod p
  receive S from process (m-1) mod p
  C(:,m) = C(:,m) + S*B((m-i) mod p,m)
end
Quick reality check: which if any of the send/receive pair above should be blocking? Which need to be nonblocking?
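
To make the data movement concrete, here is a sequential Java simulation of the shift-with-wrap-around algorithm (an illustrative sketch of my own, not MPI code): p logical "processes" each own block column m of A, B, and C, and the buffer S of A blocks is passed around a ring while each process accumulates its block column of C.

public class OneDBlockColumnSim {

    public static void main(String[] args) {
        int n = 8, p = 4, bs = n / p;               // n divisible by p; block width bs
        double[][] A = random(n), B = random(n), C = new double[n][n];

        // S[m] is process m's buffer holding the block column of A it currently has;
        // initially that is its own block column A(:, m).
        double[][][] S = new double[p][][];
        for (int m = 0; m < p; m++) S[m] = copyCols(A, m * bs, bs);

        for (int i = 0; i < p; i++) {
            for (int m = 0; m < p; m++) {
                int j = ((m - i) % p + p) % p;      // block row of B matching the A blocks in S[m]
                // C(:, m) += S[m] * B(j, m)
                for (int r = 0; r < n; r++)
                    for (int c = 0; c < bs; c++)
                        for (int k = 0; k < bs; k++)
                            C[r][m * bs + c] += S[m][r][k] * B[j * bs + k][m * bs + c];
            }
            // "send S to (m+1) mod p, receive from (m-1) mod p": a ring shift of the buffers.
            double[][][] next = new double[p][][];
            for (int m = 0; m < p; m++) next[(m + 1) % p] = S[m];
            S = next;
        }

        // Spot check one entry against the direct inner product.
        double direct = 0;
        for (int k = 0; k < n; k++) direct += A[0][k] * B[k][0];
        System.out.println("C[0][0] = " + C[0][0] + ", direct = " + direct);
    }

    static double[][] random(int n) {
        double[][] x = new double[n][n];
        for (int i = 0; i < n; i++)
            for (int j = 0; j < n; j++) x[i][j] = Math.random();
        return x;
    }

    // Copy the block column [startCol, startCol + width) of a into a new array.
    static double[][] copyCols(double[][] a, int startCol, int width) {
        double[][] block = new double[a.length][width];
        for (int i = 0; i < a.length; i++)
            for (int j = 0; j < width; j++) block[i][j] = a[i][startCol + j];
        return block;
    }
}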
Now apply a timing model to this. Assume that communication follows the linear model
t(k) = tlat + k*tbw,
where k is the number of double words sent, tlat is the latency time, and tbw is the time associated with the bandwidth of the communication channel. Each computation step will require
2 * (n/p) * (n/p) * n = 2n^3/p^2 flops
Each iteration of the for loop involves
t = tlat + (n^2/p)*tbw
communication time. So the total time is
2n^3/p^2 + (p-1)[ 2n^3/p^2 + tlat + (n^2/p)*tbw ]
= 2n^3/p + (p-1)*tlat + (n^2(p-1)/p)*tbw
Notice that if the total time consisted only of the first term, the algorithm would have perfect speedup, since that term is the total number of flops (which we are assuming take one time unit each) divided by the number of processors. Hence the other two terms give the overhead due to parallelism. The theoretical speedup is then obtained by dividing the above time into 2n^3 (why?), and the parallel efficiency is obtained by dividing speedup by p, giving
E = 1 / (1 + p(p-1)*tlat/(2n^3) + (p-1)*tbw/(2n))
Note that as n increases, the efficiency approaches 1. This indicates that high efficiencies should be attainable on this algorithm. What could prevent that from happening? The above communication pattern has a natural mapping to which kind(s) of interconnect networks for a distributed memory machine? Although MPI provides an interconnect-independent view, it is important that you be able to "see" natural mappings like these. The way to do it is by looking at the communications involved in the algorithm.
Also note that the cost associated with computation is O(n3), while the amount of communication is O(n2). This is the key to another definition of a "scalable" distributed memory algorithm; the communication costs grow slower than the computation costs. Here, however, the definition of scalable has undergone a subtle shift from that given earlier in the course, where it meant "parallel efficiency is bounded below by a positive number, as p grows".
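
As a quick sanity check on the model, the snippet below plugs assumed numbers into Tp and E (tlat and tbw here are made-up values in flop-time units, not measurements); for fixed p the printed efficiency climbs toward 1 as n grows, as claimed above.

public class EfficiencyModel1D {
    public static void main(String[] args) {
        int p = 16;
        double tlat = 1000.0, tbw = 2.0;   // assumed latency and per-word cost, in flop-times
        for (int n : new int[] { 256, 1024, 4096, 16384 }) {
            double flops = 2.0 * n * n * n;
            double tp = flops / p + (p - 1) * tlat + (double) n * n * (p - 1) / p * tbw;
            double efficiency = flops / (p * tp);   // E = speedup / p = 2n^3 / (p * Tp)
            System.out.printf("n = %6d   E = %.4f%n", n, efficiency);
        }
    }
}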

2D Block Layout Algorithm

As you should have guessed above, the 1D algorithm maps naturally to a ring of processors. Of course, since a ring interconnect network is embedded in a rectangular mesh with wrap-around and in a hypercube, it should still work alright on such machines. But it does not fully use the available interconnections - which suggests a better algorithm is possible.
Suppose now that p is a perfect square, and s^2 = p. The matrices are partitioned and assigned to an s x s mesh of processes. This version of matrix-matrix multiplication uses the formula for the matrix-matrix product that has a block inner product as the innermost loop:
 C(i,j) = C(i,j) + sum A(i,k)*B(k,j)
where the summation goes from k = 0 to k = s-1. For the algorithm below, note that we can perform the summation in any order desired (reordering relies only on addition being associative and commutative, which holds exactly in exact arithmetic and only approximately in floating point), so it can instead be written as
 C(i,j) = C(i,j) + sum A(i,(i+j+k)mod s)*B((i+j+k)mod s,j)
The algorithm has a preliminary skewing phase, followed by a multiplication/shift phase indicated by the shift of indices above. Initially suppose that s = 3, so that A looks like
      [A(0,0)   A(0,1)   A(0,2) ]
 A =  |A(1,0)   A(1,1)   A(1,2) |
      [A(2,0)   A(2,1)   A(2,2) ]
and B, C are similarly partitioned.
Phase 1A: for i = 0:s-1
            left circular shift row i of A by amount i:
              A(i,j) <-- A(i, (j+i)mod s)
          end for
At the end of Phase 1A, the matrix A looks like:
      [A(0,0)   A(0,1)   A(0,2) ]
 A =  |A(1,1)   A(1,2)   A(1,0) |
      [A(2,2)   A(2,0)   A(2,1) ]
Phase 1B: for j = 0:s-1
            upward circular shift column j of B by amount j:
              B(i,j) <-- B((i+j)mod s, j)
          end for
At the end of Phase 1B, the matrix B looks like:
      [B(0,0)   B(1,1)   B(2,2) ]
 B =  |B(1,0)   B(2,1)   B(0,2) |
      [B(2,0)   B(0,1)   B(1,2) ]
After this skewing, the actual computation begins, with a circular shift on each step:
Phase 2: for k = 0:s-1
            for i = 0:s-1 and j = 0:s-1
               C(i,j) = C(i,j) + A(i,j)*B(i,j)
            end 
            left circular shift each row of A by 1:
              A(i,j) <-- A(i, (j+1)mod s)
            upward circular shift each column of B by 1:
              B(i,j) <-- B((i+1)mod s, j)
         end for
Here is the set of pictures for this process, for s = 4:

[Figures: 2D skew matrix-matrix multiply, showing the skew and shift steps for s = 4]
The timing model for this is straightforward. First, note that in the shift phase 1A each row can proceed independently. The last row has to shift its blocks the furthest, requiring (in a mesh-connected network) s-1 messages to get a block where it belongs. Since each process in the row can be sending a message simultaneously, the communication time for Phase 1A is
(s-1)(tlat + (n/s)^2 tbw)
and clearly Phase 1B takes the same time. Phase 2 has each shift costing tlat + (n/s)^2 tbw, and each local multiplication taking 2(n/s)^3 flops. Each loop iteration in the "for i = 0:s-1 and j = 0:s-1" can be done in parallel. Since p = s^2, the theoretical time is
Tp = 2n^3/p + 3*sqrt(p)*tlat + (3n^2/sqrt(p))*tbw
and the corresponding speedup and efficiency are easily obtained. Note that, as in the first version, there is no redundant arithmetic, but now the communication cost is a factor of sqrt(p) smaller. Why is this to be expected?
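
The skew-and-shift scheme above is usually presented as Cannon's algorithm. The following sequential Java simulation (my own sketch; it shifts block indices instead of sending messages) shows that the skew phase plus s multiply/shift steps reproduce the full product C = A*B.

public class CannonSim {

    public static void main(String[] args) {
        int s = 3, bs = 2, n = s * bs;              // s x s process grid, bs x bs blocks
        double[][] A = random(n), B = random(n), C = new double[n][n];

        // Which block column of A and block row of B each grid position (i, j) holds;
        // shifting moves these indices instead of copying data.
        int[][] aCol = new int[s][s];
        int[][] bRow = new int[s][s];

        // Phase 1A/1B: skew. Row i of A shifts left by i; column j of B shifts up by j.
        for (int i = 0; i < s; i++)
            for (int j = 0; j < s; j++) {
                aCol[i][j] = (j + i) % s;           // A(i,j) <-- A(i, (j+i) mod s)
                bRow[i][j] = (i + j) % s;           // B(i,j) <-- B((i+j) mod s, j)
            }

        // Phase 2: multiply the local blocks, then shift A left by 1 and B up by 1.
        for (int step = 0; step < s; step++) {
            for (int i = 0; i < s; i++)
                for (int j = 0; j < s; j++)
                    blockMultiplyAdd(A, B, C, i, aCol[i][j], bRow[i][j], j, bs);
            for (int i = 0; i < s; i++)
                for (int j = 0; j < s; j++) {
                    aCol[i][j] = (aCol[i][j] + 1) % s;
                    bRow[i][j] = (bRow[i][j] + 1) % s;
                }
        }

        // Spot check one entry against the direct inner product.
        double direct = 0;
        for (int k = 0; k < n; k++) direct += A[0][k] * B[k][0];
        System.out.println("C[0][0] = " + C[0][0] + ", direct = " + direct);
    }

    // C(i,j) += A(i, aColBlk) * B(bRowBlk, j), all blocks bs x bs.
    static void blockMultiplyAdd(double[][] A, double[][] B, double[][] C,
            int i, int aColBlk, int bRowBlk, int j, int bs) {
        for (int r = 0; r < bs; r++)
            for (int c = 0; c < bs; c++)
                for (int k = 0; k < bs; k++)
                    C[i * bs + r][j * bs + c] +=
                            A[i * bs + r][aColBlk * bs + k] * B[bRowBlk * bs + k][j * bs + c];
    }

    static double[][] random(int n) {
        double[][] x = new double[n][n];
        for (int i = 0; i < n; i++)
            for (int j = 0; j < n; j++) x[i][j] = Math.random();
        return x;
    }
}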
http://stackoverflow.com/questions/5254294/using-executorservice-with-a-tree-of-tasks-to-perform
Java 7 has the concept of a ForkJoinPool that allows a task to "fork" off another task by submitting it to the same Executor, and then gives it the option of later attempting to "help join" that task by running it itself if it has not already been run.
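A rough sketch of the same divide-and-conquer multiply on a ForkJoinPool is below. This is not the answer's actual code: THRESHOLD, the plain triple-loop leaf kernel, and the two-phase invokeAll grouping are my choices, the grouping being there so that no two concurrent tasks update the same quadrant of C.

import java.util.Arrays;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RecursiveAction;

public class ForkJoinMatrixMultiply {

    static final int THRESHOLD = 64;   // leaf size; chosen arbitrarily for this sketch

    static class MultiplyTask extends RecursiveAction {
        final float[][] A, B, C;
        final int aRow, aCol, bRow, bCol, cRow, cCol, size;

        MultiplyTask(float[][] A, int aRow, int aCol, float[][] B, int bRow, int bCol,
                float[][] C, int cRow, int cCol, int size) {
            this.A = A; this.aRow = aRow; this.aCol = aCol;
            this.B = B; this.bRow = bRow; this.bCol = bCol;
            this.C = C; this.cRow = cRow; this.cCol = cCol;
            this.size = size;
        }

        @Override
        protected void compute() {
            if (size <= THRESHOLD) {
                // Plain triple loop at the leaves; a tuned kernel would block/stride
                // the way multiplyStride2 does above.
                for (int i = 0; i < size; i++)
                    for (int j = 0; j < size; j++) {
                        float s = 0f;
                        for (int k = 0; k < size; k++)
                            s += A[aRow + i][aCol + k] * B[bRow + k][bCol + j];
                        C[cRow + i][cCol + j] += s;
                    }
                return;
            }
            int h = size / 2;
            // First the contributions using the left half of A and the top half of B,
            // then those using the right half of A and the bottom half of B: within
            // each invokeAll the four tasks write disjoint quadrants of C.
            invokeAll(
                    new MultiplyTask(A, aRow, aCol, B, bRow, bCol, C, cRow, cCol, h),
                    new MultiplyTask(A, aRow, aCol, B, bRow, bCol + h, C, cRow, cCol + h, h),
                    new MultiplyTask(A, aRow + h, aCol, B, bRow, bCol, C, cRow + h, cCol, h),
                    new MultiplyTask(A, aRow + h, aCol, B, bRow, bCol + h, C, cRow + h, cCol + h, h));
            invokeAll(
                    new MultiplyTask(A, aRow, aCol + h, B, bRow + h, bCol, C, cRow, cCol, h),
                    new MultiplyTask(A, aRow, aCol + h, B, bRow + h, bCol + h, C, cRow, cCol + h, h),
                    new MultiplyTask(A, aRow + h, aCol + h, B, bRow + h, bCol, C, cRow + h, cCol, h),
                    new MultiplyTask(A, aRow + h, aCol + h, B, bRow + h, bCol + h, C, cRow + h, cCol + h, h));
        }
    }

    public static void main(String[] args) {
        int n = 256;
        float[][] a = new float[n][n], b = new float[n][n], c = new float[n][n];
        for (float[] row : a) Arrays.fill(row, 1f);
        for (float[] row : b) Arrays.fill(row, 1f);
        new ForkJoinPool().invoke(new MultiplyTask(a, 0, 0, b, 0, 0, c, 0, 0, n));
        System.out.println(c[0][0]);   // expect n = 256.0
    }
}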

http://www.cse.buffalo.edu/faculty/miller/Courses/CSE633/Ortega-Fall-2012-CSE633.pdf
http://www.slideshare.net/pkpramit/matrix-multiplicationan-example-of-concurrent-programming
http://www.cs.berkeley.edu/~yelick/cs267-sp04/lectures/13/lect13-pmatmul-6x.pdf
