Open In App

Cache Oblivious Algorithm

Cache-oblivious design is a way of obtaining algorithms that are efficient on arbitrary memory hierarchies without resorting to complicated multi-level memory models. A cache-oblivious algorithm performs an asymptotically optimal amount of work and moves data asymptotically optimally between all levels of the memory hierarchy, yet it never refers to hardware parameters such as the cache size or the cache-line length: it benefits from the processor cache implicitly instead of being tuned for it. 

This article focuses on discussing the following topics:



Cache Oblivious Model

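Cache-oblivious models are built so that algorithms are independent of hardware parameters such as the size of the cache memory and the length of a cache line: these parameters appear in the analysis of an algorithm, but never in the algorithm itself.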



Features:

Justification of the Model 

The Cache Oblivious Model can be justified based on the following points:

External Memory Model

Associative Cache:

Cache set number = [Main Memory block address] modulo [number of sets in the cache] 

Optimal Cache Replacement Policy:

Why use Cache-Oblivious Algorithm?

Tall Cache Assumption

The ideal cache model relies on an assumption called the “tall cache” assumption, which is used when calculating the cache complexity of an algorithm. It is expressed by the following relation:

$Z = \Omega(B^2)$

Here, 
Z is the size of the cache.
B is the size of the cache line.
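Ω denotes an asymptotic lower bound: $Z = \Omega(B^2)$ means that the cache size is at least proportional to the square of the line size, so the cache holds at least on the order of B lines (it is “taller” than it is wide).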

Examples of Cache-Oblivious Algorithm

1. Array Reversal:

2. Matrix Transpose:

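If n >= m (where A is an m × n matrix and B = Aᵀ is the n × m result),  
we partition A by columns and B by rows:
$A = (A_1 \;\; A_2)$,  
$B = \begin{pmatrix} B_1 \\ B_2 \end{pmatrix}$, and then recursively transpose $A_1$ into $B_1$ and $A_2$ into $B_2$.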




#include <iostream>
#include <vector>
 
#define BLOCK_SIZE 64

/**
 * Transposes the n x m matrix A, walking it in BLOCK_SIZE x BLOCK_SIZE tiles.
 *
 * Square input (n == m) is transposed in place by swapping every element
 * strictly above the main diagonal with its mirror image. Each pair is
 * swapped exactly once; swapping it from both sides would undo the
 * transposition and leave the matrix unchanged.
 *
 * Rectangular input (n != m) cannot be swapped in place because A[l][k]
 * would fall outside the matrix, so an m x n result is built tile by tile
 * and then swapped into A.
 *
 * Non-positive dimensions leave A untouched.
 *
 * @param n number of rows of A
 * @param m number of columns of A
 * @param A matrix to transpose; on return it holds the m x n transpose
 */
void transpose(int n, int m, std::vector<std::vector<int>> &A)
{
    if (n <= 0 || m <= 0) {
        return;
    }

    if (n != m) {
        std::vector<std::vector<int>> T(m, std::vector<int>(n));
        for (int i = 0; i < n; i += BLOCK_SIZE) {
            for (int j = 0; j < m; j += BLOCK_SIZE) {
                for (int k = i; k < i + BLOCK_SIZE && k < n; ++k) {
                    for (int l = j; l < j + BLOCK_SIZE && l < m; ++l) {
                        T[l][k] = A[k][l];
                    }
                }
            }
        }
        A.swap(T);
        return;
    }

    for (int i = 0; i < n; i += BLOCK_SIZE) {
        // Only tiles on or above the diagonal are visited (j starts at i);
        // the mirrored tile below the diagonal is updated by the same swap.
        for (int j = i; j < n; j += BLOCK_SIZE) {
            for (int k = i; k < i + BLOCK_SIZE && k < n; ++k) {
                // Inside a diagonal tile, start just right of the diagonal
                // so no pair is swapped twice and A[k][k] is left alone.
                const int first = (j > k) ? j : k + 1;
                for (int l = first; l < j + BLOCK_SIZE && l < n; ++l) {
                    const int temp = A[k][l];
                    A[k][l] = A[l][k];
                    A[l][k] = temp;
                }
            }
        }
    }
}
 
int main()
{
    int n = 1024, m = 1024;
    std::vector<std::vector<int>> A(n, std::vector<int>(m, 0));
 
    // Initialize the matrix with some values
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j) {
            A[i][j] = i * n + j;
        }
    }
 
    transpose(n, m, A);
 
    return 0;
}




// C code for implementing above approach
#include <stdio.h>
#include <stdlib.h>
 
#define BLOCK_SIZE 64

/*
 * Transposes the square n x n matrix A in place, walking it in
 * BLOCK_SIZE x BLOCK_SIZE tiles.
 *
 * Every element strictly above the main diagonal is swapped with its mirror
 * image exactly once; swapping a pair from both sides would undo the
 * transposition and leave the matrix unchanged.
 *
 * n: number of rows of A
 * m: number of columns of A
 * A: matrix to transpose
 *
 * The swap addresses A[l][k], which only lies inside the array when
 * n == m, so a rectangular matrix is left untouched instead of being
 * accessed out of bounds.
 */
void transpose(int n, int m, int A[n][m])
{
    if (n != m) {
        return;
    }

    for (int i = 0; i < n; i += BLOCK_SIZE) {
        /* Only tiles on or above the diagonal are visited (j starts at i);
         * the mirrored tile is updated by the same swap. */
        for (int j = i; j < n; j += BLOCK_SIZE) {
            for (int k = i; k < i + BLOCK_SIZE && k < n; ++k) {
                /* Inside a diagonal tile, start just right of the diagonal
                 * so that no pair is swapped twice. */
                int first = (j > k) ? j : k + 1;
                for (int l = first; l < j + BLOCK_SIZE && l < n; ++l) {
                    int temp = A[k][l];
                    A[k][l] = A[l][k];
                    A[l][k] = temp;
                }
            }
        }
    }
}
 
// Driver's code
int main(int argc, char* argv[])
{
    /* Command-line arguments are not used by this demo. */
    (void)argc;
    (void)argv;

    int n = 1024, m = 1024;

    /* n * m ints (about 4 MiB with a 4-byte int) is too large to place
     * safely in an automatic variable-length array, so the matrix lives on
     * the heap. A is still addressed as A[i][j]. */
    int (*A)[m] = malloc((size_t)n * sizeof *A);
    if (A == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    // Initialize the matrix with some values
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j) {
            A[i][j] = i * n + j;
        }
    }

    transpose(n, m, A);

    free(A);
    return 0;
}




import java.util.Arrays;
 
public class Main{
 
    // Define block size as a constant
    private static final int BLOCK_SIZE = 64;
 
    /**
     * Transposes the square matrix A in place, walking it in
     * BLOCK_SIZE x BLOCK_SIZE tiles.
     *
     * Every element strictly above the main diagonal is swapped with its
     * mirror image exactly once; swapping a pair from both sides would undo
     * the transposition and leave the matrix unchanged.
     *
     * @param n number of rows of A
     * @param m number of columns of A
     * @param A matrix to transpose
     * @throws IllegalArgumentException if both dimensions are positive and
     *         n != m, because the swap with A[l][k] is only defined for a
     *         square matrix
     */
    private static void transpose(int n, int m, int[][] A) {
        // Nothing to do for an empty matrix.
        if (n <= 0 || m <= 0) {
            return;
        }
        if (n != m) {
            throw new IllegalArgumentException(
                "In-place transpose requires a square matrix, got " + n + " x " + m);
        }

        for (int i = 0; i < n; i += BLOCK_SIZE) {
            final int rowEnd = Math.min(i + BLOCK_SIZE, n);
            // Only tiles on or above the diagonal are visited (j starts at i);
            // the mirrored tile is updated by the same swap.
            for (int j = i; j < n; j += BLOCK_SIZE) {
                final int colEnd = Math.min(j + BLOCK_SIZE, n);
                for (int k = i; k < rowEnd; ++k) {
                    // Inside a diagonal tile, start just right of the diagonal
                    // so that no pair is swapped twice.
                    for (int l = Math.max(j, k + 1); l < colEnd; ++l) {
                        int temp = A[k][l];
                        A[k][l] = A[l][k];
                        A[l][k] = temp;
                    }
                }
            }
        }
    }
 
    /** Demo driver: fills a 1024 x 1024 matrix and transposes it. */
    public static void main(String[] args) {
        int n = 1024, m = 1024;
        int[][] A = new int[n][m];
 
        // Initialize the matrix with some values
        for (int i = 0; i < n; ++i) {
            for (int j = 0; j < m; ++j) {
                A[i][j] = i * n + j;
            }
        }
 
        // Perform matrix transposition
        transpose(n, m, A);
    }
}




def transpose(n, m, A):
    """Transpose the n x m matrix A in place, working in 64 x 64 tiles.

    Square input (n == m) is transposed by swapping every element strictly
    above the main diagonal with its mirror image. Each pair is swapped
    exactly once; swapping it from both sides would undo the transposition
    and leave the matrix unchanged.

    Rectangular input (n != m) cannot be swapped element-wise because
    A[l][k] would fall outside the matrix, so an m x n result is built tile
    by tile and then written back into the same list object.

    Non-positive dimensions leave A untouched.

    Args:
        n: number of rows of A.
        m: number of columns of A.
        A: list of n row lists, each holding m items.

    Returns:
        None. A is modified in place.
    """
    BLOCK_SIZE = 64

    if n <= 0 or m <= 0:
        return

    if n != m:
        T = [[0] * n for _ in range(m)]
        for i in range(0, n, BLOCK_SIZE):
            for j in range(0, m, BLOCK_SIZE):
                for k in range(i, min(i + BLOCK_SIZE, n)):
                    row = A[k]
                    for l in range(j, min(j + BLOCK_SIZE, m)):
                        T[l][k] = row[l]
        # Slice assignment keeps the caller's list object and replaces its rows.
        A[:] = T
        return

    for i in range(0, n, BLOCK_SIZE):
        # Only tiles on or above the diagonal are visited (j starts at i);
        # the mirrored tile is updated by the same swap.
        for j in range(i, n, BLOCK_SIZE):
            for k in range(i, min(i + BLOCK_SIZE, n)):
                # Inside a diagonal tile, start just right of the diagonal
                # so that no pair is swapped twice.
                for l in range(max(j, k + 1), min(j + BLOCK_SIZE, n)):
                    A[k][l], A[l][k] = A[l][k], A[k][l]
 
 
def main():
    """Build a 1024 x 1024 demo matrix, pass it to transpose(), return 0."""
    n = m = 1024

    # The cell at (row, col) starts out as row * n + col.
    A = [[row * n + col for col in range(m)] for row in range(n)]

    transpose(n, m, A)

    return 0
 
 
# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()




using System;
 
public class Program
{
    // Define block size as a constant
    private const int BLOCK_SIZE = 64;
 
    /// <summary>
    /// Transposes the square matrix A in place, walking it in
    /// BLOCK_SIZE x BLOCK_SIZE tiles. Every element strictly above the main
    /// diagonal is swapped with its mirror image exactly once; swapping a
    /// pair from both sides would undo the transposition and leave the
    /// matrix unchanged.
    /// </summary>
    /// <param name="n">Number of rows of A.</param>
    /// <param name="m">Number of columns of A.</param>
    /// <param name="A">Matrix to transpose.</param>
    /// <exception cref="ArgumentException">
    /// Both dimensions are positive and n != m; the swap with A[l, k] is
    /// only defined for a square matrix.
    /// </exception>
    private static void Transpose(int n, int m, int[,] A)
    {
        // Nothing to do for an empty matrix.
        if (n <= 0 || m <= 0)
        {
            return;
        }
        if (n != m)
        {
            throw new ArgumentException(
                "In-place transpose requires a square matrix, got " + n + " x " + m + ".");
        }

        for (int i = 0; i < n; i += BLOCK_SIZE)
        {
            int rowEnd = Math.Min(i + BLOCK_SIZE, n);
            // Only tiles on or above the diagonal are visited (j starts at i);
            // the mirrored tile is updated by the same swap.
            for (int j = i; j < n; j += BLOCK_SIZE)
            {
                int colEnd = Math.Min(j + BLOCK_SIZE, n);
                for (int k = i; k < rowEnd; ++k)
                {
                    // Inside a diagonal tile, start just right of the
                    // diagonal so that no pair is swapped twice.
                    for (int l = Math.Max(j, k + 1); l < colEnd; ++l)
                    {
                        int temp = A[k, l];
                        A[k, l] = A[l, k];
                        A[l, k] = temp;
                    }
                }
            }
        }
    }
 
    /// <summary>Demo driver: fills a 1024 x 1024 matrix and transposes it.</summary>
    public static void Main(string[] args)
    {
        int n = 1024, m = 1024;
        int[,] A = new int[n, m];
 
        // Initialize the matrix with some values
        for (int i = 0; i < n; ++i)
        {
            for (int j = 0; j < m; ++j)
            {
                A[i, j] = i * n + j;
            }
        }
 
        // Perform matrix transposition
        Transpose(n, m, A);
    }
}
//This code is contributed by Utkarsh




// Define the block size constant
const BLOCK_SIZE = 64;
 
/**
 * Transposes the n x m matrix A in place, working in
 * BLOCK_SIZE x BLOCK_SIZE tiles.
 *
 * Square input (n === m) is transposed by swapping every element strictly
 * above the main diagonal with its mirror image. Each pair is swapped
 * exactly once; swapping it from both sides would undo the transposition
 * and leave the matrix unchanged.
 *
 * Rectangular input (n !== m) cannot be swapped element-wise because
 * A[l][k] would fall outside the matrix, so an m x n result is built tile
 * by tile and then written back into the same array object.
 *
 * Non-positive dimensions leave A untouched.
 *
 * @param {number} n - Number of rows of A.
 * @param {number} m - Number of columns of A.
 * @param {number[][]} A - Array of n rows with m entries each; modified in place.
 * @returns {void}
 */
function transpose(n, m, A) {
    if (n <= 0 || m <= 0) {
        return;
    }

    if (n !== m) {
        const T = Array.from({ length: m }, () => new Array(n));
        for (let i = 0; i < n; i += BLOCK_SIZE) {
            for (let j = 0; j < m; j += BLOCK_SIZE) {
                for (let k = i; k < i + BLOCK_SIZE && k < n; ++k) {
                    for (let l = j; l < j + BLOCK_SIZE && l < m; ++l) {
                        T[l][k] = A[k][l];
                    }
                }
            }
        }
        // Keep the caller's array object and replace its rows.
        A.length = 0;
        for (const row of T) {
            A.push(row);
        }
        return;
    }

    for (let i = 0; i < n; i += BLOCK_SIZE) {
        // Only tiles on or above the diagonal are visited (j starts at i);
        // the mirrored tile is updated by the same swap.
        for (let j = i; j < n; j += BLOCK_SIZE) {
            for (let k = i; k < i + BLOCK_SIZE && k < n; ++k) {
                // Inside a diagonal tile, start just right of the diagonal
                // so that no pair is swapped twice.
                for (let l = Math.max(j, k + 1); l < j + BLOCK_SIZE && l < n; ++l) {
                    const temp = A[k][l];
                    A[k][l] = A[l][k];
                    A[l][k] = temp;
                }
            }
        }
    }
}
 
// Demo driver: builds a 1024 x 1024 matrix, passes it to transpose(),
// and returns the matrix.
function main() {
    const n = 1024, m = 1024;

    // The cell at (row, col) starts out as row * n + col.
    const A = Array.from({ length: n }, (_, row) =>
        Array.from({ length: m }, (_, col) => row * n + col));

    transpose(n, m, A);

    return A;
}

// Call the main function
main();

3. Binary search tree (Divide And Conquer Algorithm):

4. Merge Sort:

$\Theta\!\left(\frac{N}{B}\,\log_{M/B}\frac{N}{B}\right)$

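The total number of memory transfers for this kind of sorting algorithm satisfies the recurrence $T(N) = \frac{M}{B}\,T\!\left(\frac{N}{M/B}\right) + \Theta\!\left(\frac{N}{B}\right)$.
The recursion tree has $\Theta(N/B)$ leaves, each costing $\Theta(1)$ memory transfers, for a total leaf cost of $\Theta(N/B)$.
The number of levels in the recursion tree is $\log_{M/B}\frac{N}{B}$, and each level costs $\Theta(N/B)$, so the total cost is $\Theta\!\left(\frac{N}{B}\,\log_{M/B}\frac{N}{B}\right)$.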

T(N) = 2T(N/2) + Θ(N/B)

References


Article Tags :