diff --git a/serial/serial_gs_pagerank.c b/serial/serial_gs_pagerank.c
new file mode 100644
index 0000000..ad96d24
--- /dev/null
+++ b/serial/serial_gs_pagerank.c
@@ -0,0 +1,60 @@
+#include <sys/time.h>
+
+#include "serial_gs_pagerank_functions.h"
+
+struct timeval startwtime, endwtime;
+
+/*
+ * main parses the command line, loads the graph, runs pagerank() and prints
+ * the wall-clock time of the run. Unless history output (-h) was requested,
+ * the final pagerank vector and the per-page iteration counts are written to
+ * the output file.
+*/
+int main(int argc, char **argv) {
+	CsrSparseMatrix transitionMatrix = initCsrSparseMatrix();
+	double *pagerankVector;
+	bool convergenceStatus;
+	Parameters parameters;
+	int maxIterationsForConvergence = 0;
+
+	parseArguments(argc, argv, &parameters);
+
+	initialize(&transitionMatrix, &pagerankVector, &parameters);
+
+	// Starts wall-clock timer
+	gettimeofday (&startwtime, NULL);
+
+	// pagerank() mallocs the iterations array itself and hands back an int*,
+	// so main must not allocate a buffer of its own here: it would be
+	// overwritten by the assignment and leaked.
+	int *iterations = pagerank(&transitionMatrix, &pagerankVector,
+		&convergenceStatus, parameters, &maxIterationsForConvergence);
+	if (parameters.verbose) {
+		printf(ANSI_COLOR_YELLOW "\n----- RESULTS -----\n" ANSI_COLOR_RESET);
+		if (convergenceStatus) {
+			printf(ANSI_COLOR_GREEN "Pagerank converged after %d iterations!\n" \
+				ANSI_COLOR_RESET, maxIterationsForConvergence);
+		} else {
+			printf(ANSI_COLOR_RED "Pagerank did not converge after max number of" \
+				" iterations (%d) was reached!\n" ANSI_COLOR_RESET, maxIterationsForConvergence);
+		}
+	}
+
+	// Stops wall-clock timer
+	gettimeofday (&endwtime, NULL);
+	double seq_time = (double)((endwtime.tv_usec - startwtime.tv_usec)/1.0e6
+		+ endwtime.tv_sec - startwtime.tv_sec);
+	printf("%s wall clock time = %f\n","Pagerank (Gauss-Seidel method), serial implementation",
+		seq_time);
+	if (!parameters.history) {
+		// Always outputs numberOfPages, max_iterations, last pagerank and iterations
+		// for all pages
+		savePagerankToFile(parameters.outputFilename, false, pagerankVector,
+			parameters.numberOfPages, iterations, maxIterationsForConvergence);
+	}
+
+	free(iterations);
+	free(pagerankVector);
+	destroyCsrSparseMatrix(&transitionMatrix);
+	return 0;
+}
diff --git a/serial/serial_gs_pagerank_functions.c 
b/serial/serial_gs_pagerank_functions.c new file mode 100644 index 0000000..690ce50 --- /dev/null +++ b/serial/serial_gs_pagerank_functions.c @@ -0,0 +1,520 @@ +/* ===== INCLUDES ===== */ + +#include "serial_gs_pagerank_functions.h" + +/* ===== CONSTANTS ===== */ + +const char *ARGUMENT_CONVERGENCE_TOLERANCE = "-c"; +const char *ARGUMENT_MAX_ITERATIONS = "-m"; +const char *ARGUMENT_DAMPING_FACTOR = "-a"; +const char *ARGUMENT_VERBAL_OUTPUT = "-v"; +const char *ARGUMENT_OUTPUT_HISTORY = "-h"; +const char *ARGUMENT_OUTPUT_FILENAME = "-o"; + +const int NUMERICAL_BASE = 10; +char *DEFAULT_OUTPUT_FILENAME = "pagerank_output"; +const int FILE_READ_BUFFER_SIZE = 4096; + +const int CONVERGENCE_CHECK_ITERATION_PERIOD = 2; +const int SPARSITY_INCREASE_ITERATION_PERIOD = 10; + +/* ===== FUNCTIONS ===== */ + +int* pagerank(CsrSparseMatrix *transitionMatrix, double **pagerankVector, + bool *convergenceStatus, Parameters parameters, int* maxIterationsForConvergence) { + // Variables declaration + int numberOfPages = parameters.numberOfPages; + double delta, *pagerankDifference, *previousPagerankVector, + *convergedPagerankVector, *linksFromConvergedPagesPagerankVector; + CooSparseMatrix linksFromConvergedPages = initCooSparseMatrix(); + bool *convergenceMatrix; + + int* iterations = (int *)malloc(numberOfPages*sizeof(int)); + // Space allocation + { + size_t sizeofDouble = sizeof(double); + // pagerankDifference used to calculate delta + pagerankDifference = (double *) malloc(numberOfPages * sizeofDouble); + // previousPagerankVector holds last iteration's pagerank vector + previousPagerankVector = (double *) malloc(numberOfPages * sizeofDouble); + // convergedPagerankVector is the pagerank vector of converged pages only + convergedPagerankVector = (double *) malloc(numberOfPages * sizeofDouble); + // linksFromConvergedPagesPagerankVector holds the partial sum of the + // pagerank vector, that describes effect of the links from converged + // pages to non converged pages + 
linksFromConvergedPagesPagerankVector = (double *) malloc(numberOfPages * sizeofDouble); + // convergenceMatrix indicates which pages have converged + convergenceMatrix = (bool *) malloc(numberOfPages * sizeof(bool)); + *convergenceStatus = false; + + // Initialization + allocMemoryForCoo(&linksFromConvergedPages, transitionMatrix->numberOfNonZeroElements); + for (int i=0; irowCumulativeIndexes[i], + rowEndIndex = transitionMatrix->rowCumulativeIndexes[i+1]; + if (rowEndIndex > rowStartIndex) { + // This row (page) has non zero elements (out-links) + for (int j=rowStartIndex; jcolumnIndexes[j]; + if (convergenceMatrix[pageLinksTo] == false){ + // Link exists, adds element to the vector + addElement(&linksFromConvergedPages, + transitionMatrix->values[j], i, pageLinksTo); + } + } + } + + // Increases sparsity of the transition matrix by zeroing + // out elements that correspond to converged pages + zeroOutRow(transitionMatrix, i); + zeroOutColumn(transitionMatrix, i); + + // Builds the new linksFromConvergedPagesPagerankVector + cooSparseMatrixVectorMultiplication(linksFromConvergedPages, + *pagerankVector, &linksFromConvergedPagesPagerankVector, + numberOfPages); + } + } + free(newlyConvergedPages); + } + + for(int i=0; i 10) { + validUsage(argumentVector[0]); + } + + (*parameters).numberOfPages = 0; + (*parameters).maxIterations = 0; + (*parameters).convergenceCriterion = 1; + (*parameters).dampingFactor = 0.85; + (*parameters).verbose = false; + (*parameters).history = false; + (*parameters).outputFilename = DEFAULT_OUTPUT_FILENAME; + + char *endPointer; + int argumentIndex = 1; + + while (argumentIndex < argumentCount) { + if (!strcmp(argumentVector[argumentIndex], ARGUMENT_CONVERGENCE_TOLERANCE)) { + argumentIndex = checkIncrement(argumentIndex, argumentCount, argumentVector[0]); + + double convergenceInput = strtod(argumentVector[argumentIndex], &endPointer); + if (convergenceInput == 0) { + printf("Invalid convergence argument\n"); + exit(EXIT_FAILURE); + } + 
(*parameters).convergenceCriterion = convergenceInput; + } else if (!strcmp(argumentVector[argumentIndex], ARGUMENT_MAX_ITERATIONS)) { + argumentIndex = checkIncrement(argumentIndex, argumentCount, argumentVector[0]); + + size_t iterationsInput = strtol(argumentVector[argumentIndex], &endPointer, NUMERICAL_BASE); + if (iterationsInput == 0 && endPointer) { + printf("Invalid iterations argument\n"); + exit(EXIT_FAILURE); + } + (*parameters).maxIterations = iterationsInput; + } else if (!strcmp(argumentVector[argumentIndex], ARGUMENT_DAMPING_FACTOR)) { + argumentIndex = checkIncrement(argumentIndex, argumentCount, argumentVector[0]); + + double alphaInput = strtod(argumentVector[argumentIndex], &endPointer); + if ((alphaInput == 0 || alphaInput > 1) && endPointer) { + printf("Invalid alpha argument\n"); + exit(EXIT_FAILURE); + } + (*parameters).dampingFactor = alphaInput; + } else if (!strcmp(argumentVector[argumentIndex], ARGUMENT_VERBAL_OUTPUT)) { + (*parameters).verbose = true; + } else if (!strcmp(argumentVector[argumentIndex], ARGUMENT_OUTPUT_HISTORY)) { + (*parameters).history = true; + } else if (!strcmp(argumentVector[argumentIndex], ARGUMENT_OUTPUT_FILENAME)) { + argumentIndex = checkIncrement(argumentIndex, argumentCount, argumentVector[0]); + + if (fopen(argumentVector[argumentIndex], "w") == NULL) { + printf("Invalid output filename. Reverting to default.\n"); + continue; + } + (*parameters).outputFilename = argumentVector[argumentIndex]; + } else if (argumentIndex == argumentCount - 1) { + (*parameters).graphFilename = argumentVector[argumentIndex]; + } else { + validUsage(argumentVector[0]); + exit(EXIT_FAILURE); + } + ++argumentIndex; + } +} + +/* + * readGraphFromFile loads the file supplied in the command line arguments to an + * array (directedWebGraph) that represents the graph. 
+*/ +void generateNormalizedTransitionMatrixFromFile(CsrSparseMatrix *transitionMatrix, + Parameters *parameters){ + FILE *graphFile; + + // Opens the file for reading + graphFile = fopen((*parameters).graphFilename, "r+"); + if (!graphFile) { + printf("Error opening file \n"); + exit(EXIT_FAILURE); + } + + char buffer[FILE_READ_BUFFER_SIZE]; + char *readResult; + // Skips the first two lines + readResult = fgets(buffer, FILE_READ_BUFFER_SIZE, graphFile); + readResult = fgets(buffer, FILE_READ_BUFFER_SIZE, graphFile); + if (readResult == NULL) { + printf("Error while reading from the file. Does the file have the correct format?\n"); + exit(EXIT_FAILURE); + } + + // Third line contains the numbers of nodes and edges + int numberOfNodes = 0, numberOfEdges = 0; + + readResult = fgets(buffer, FILE_READ_BUFFER_SIZE, graphFile); + if (readResult == NULL) { + printf("Error while reading from the file. Does the file have the correct format?\n"); + exit(EXIT_FAILURE); + } + + // Parses the number of nodes and number of edges + { + // Splits string to whitespace + char *token = strtok(buffer, " "); + bool nextIsNodes = false, nextIsEdges = false; + + while (token != NULL) { + if (strcmp(token, "Nodes:") == 0) { + nextIsNodes = true; + } else if (nextIsNodes) { + numberOfNodes = atoi(token); + nextIsNodes = false; + } else if (strcmp(token, "Edges:") == 0) { + nextIsEdges = true; + } else if (nextIsEdges) { + numberOfEdges = atoi(token); + break; + } + + // Gets next string token + token = strtok (NULL, " ,.-"); + } + } + + if ((*parameters).verbose) { + printf("File claims number of pages is: %d\nThe number of edges is: %d\n", + numberOfNodes, numberOfEdges); + } + + // Skips the fourth line + readResult = fgets(buffer, 512, graphFile); + if (readResult == NULL) { + printf("Error while reading from the file. 
Does the file have the correct format?\n"); + exit(EXIT_FAILURE); + } + + + int maxPageIndex = 0; + CooSparseMatrix tempMatrix = initCooSparseMatrix(); + allocMemoryForCoo(&tempMatrix, numberOfEdges); + + for (int i=0; i maxPageIndex) { + maxPageIndex = fileFrom; + } + if (fileTo > maxPageIndex) { + maxPageIndex = fileTo; + } + addElement(&tempMatrix, 1, fileFrom, fileTo); + } + + if ((*parameters).verbose) { + printf("Max page index found is: %d\n", maxPageIndex); + } + (*parameters).numberOfPages = maxPageIndex + 1; + + // Calculates the outdegree of each page and assigns the uniform probability + // of transition to the elements of the corresponding row + int* pageOutdegree = malloc((*parameters).numberOfPages*sizeof(int)); + for (int i=0; i<(*parameters).numberOfPages; ++i){ + pageOutdegree[i] = 0; + } + + for (int i=0; irowIndex; + ++pageOutdegree[currentRow]; + } + + for (int i=0; ivalue = 1./pageOutdegree[tempMatrix.elements[i]->rowIndex]; + } + free(pageOutdegree); + + // Transposes the temporary transition matrix (P^T). + transposeSparseMatrix(&tempMatrix); + + allocMemoryForCsr(transitionMatrix, numberOfEdges); + // Transforms the temporary COO matrix to the desired CSR format + transformToCSR(tempMatrix, transitionMatrix); + destroyCooSparseMatrix(&tempMatrix); + + fclose(graphFile); +} + +/* + * validUsage outputs a message to the console that informs the user of the + * correct (valid) way to use the program. 
+*/
+void validUsage(char *programName) {
+	printf("%s [-c convergence_criterion] [-m max_iterations] [-a alpha] [-v] [-h] [-o output_filename] " \
+		"\n-c convergence_criterion" \
+		"\n\tthe convergence tolerance criterion" \
+		"\n-m max_iterations" \
+		"\n\tmaximum number of iterations to perform" \
+		"\n-a alpha" \
+		"\n\tthe damping factor" \
+		"\n-v enable verbal output" \
+		"\n-h enable history output to file" \
+		"\n-o output_filename" \
+		"\n\tfilename and path for the output" \
+		"\n", programName);
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * checkIncrement is a helper function for parseArguments function.
+ *
+ * previousIndex is the position of an option that requires a value and
+ * maxIndex is the argument count. Returns the index of that value
+ * (previousIndex + 1). If the option is the last argument there is no value to
+ * read, so the usage message is printed and the program exits.
+*/
+int checkIncrement(int previousIndex, int maxIndex, char *programName) {
+	// The value must live at previousIndex + 1, which has to be a valid
+	// index (< maxIndex). Comparing previousIndex == maxIndex never fires,
+	// because the caller only gets here while previousIndex < maxIndex, and
+	// would let argumentVector[argumentCount] (a null pointer) be parsed.
+	if (previousIndex + 1 >= maxIndex) {
+		validUsage(programName);
+		// Not reached while validUsage exits; kept as a safety net.
+		exit(EXIT_FAILURE);
+	}
+	return previousIndex + 1;
+}
+
+// NOTE(review): the text from the loop header below up to the header file's
+// #include lines is damaged in this patch (the rest of savePagerankToFile, the
+// next file's diff header and the #include operands are missing). It is left
+// exactly as found; restore it from the original sources.
+void savePagerankToFile(char *filename, bool append, double *pagerankVector,
+	int vectorSize, int* iterations, int maxIterationsForConvergence) {
+	FILE *outputFile;
+
+	if (append) {
+		outputFile = fopen(filename, "a");
+	} else {
+		outputFile = fopen(filename, "w");
+	}
+
+	if (outputFile == NULL) {
+		printf("Error while opening the output file.\n");
+		return;
+	}
+
+
+	if(append){
+		double sum = 0;
+		for (int i=0; i
+#include
+#include
+#include
+#include
+
+#include "coo_sparse_matrix.h"
+
+/* ===== DEFINITIONS ===== */
+
+//Colors used for better console output formatting.
+#define ANSI_COLOR_RED "\x1B[31m"
+#define ANSI_COLOR_GREEN "\x1B[32m"
+#define ANSI_COLOR_YELLOW "\x1B[33m"
+#define ANSI_COLOR_BLUE "\x1B[34m"
+#define ANSI_COLOR_CYAN "\x1B[36m"
+#define ANSI_COLOR_RESET "\x1B[0m"
+
+/* ===== CONSTANTS DEFINITION ===== */
+
+// Constant strings that store the command line options available.
+extern const char *ARGUMENT_CONVERGENCE_TOLERANCE;
+extern const char *ARGUMENT_MAX_ITERATIONS;
+extern const char *ARGUMENT_DAMPING_FACTOR;
+extern const char *ARGUMENT_VERBAL_OUTPUT;
+extern const char *ARGUMENT_OUTPUT_HISTORY;
+extern const char *ARGUMENT_OUTPUT_FILENAME;
+// The numerical base used when parsing numerical command line arguments.
+extern const int NUMERICAL_BASE;
+// Default filename used for the output.
+extern char *DEFAULT_OUTPUT_FILENAME;
+// The size of the buffer used for reading the graph input file.
+extern const int FILE_READ_BUFFER_SIZE;
+
+/* ===== STRUCTURES ===== */
+
+// A data structure to conveniently hold the algorithm's parameters.
+// parseArguments fills in the defaults and the values given on the command
+// line; numberOfPages is only known after the graph file has been read.
+typedef struct parameters {
+	// numberOfPages is derived from the graph file (largest page index + 1);
+	// maxIterations is set by -m and defaults to 0.
+	int numberOfPages, maxIterations;
+	// Set by -c (default 1) and -a (default 0.85) respectively.
+	double convergenceCriterion, dampingFactor;
+	// Set by -v and -h; both default to false.
+	bool verbose, history;
+	// outputFilename is set by -o (defaults to DEFAULT_OUTPUT_FILENAME);
+	// graphFilename is taken from the last command line argument.
+	char *outputFilename, *graphFilename;
+} Parameters;
+
+/* ===== FUNCTION DECLARATIONS ===== */
+
+// Function validUsage prints the correct way to use the program with command
+// line arguments and then terminates the program with EXIT_FAILURE.
+void validUsage(char *programName);
+
+// Function checkIncrement is a helper function used in parseArguments (see
+// below). It returns the index of the argument that follows previousIndex, or
+// prints the usage message and exits when its bounds check fails.
+int checkIncrement(int previousIndex, int maxIndex, char *programName);
+
+// Function parseArguments parses command line arguments. It first stores the
+// default values in parameters and then overrides them with the options found
+// in argumentVector.
+void parseArguments(int argumentCount, char **argumentVector,
+	Parameters *parameters);
+
+// Function generateNormalizedTransitionMatrixFromFile reads through the entries
+// of the file specified in the arguments (parameters->graphFilename), using
+// them to populate the sparse array (transitionMatrix). The entries of the file
+// represent the edges of the web transition graph. The entries are then
+// modified to become the rows of the transition matrix.
+// It also sets parameters->numberOfPages and exits the program if the file
+// cannot be opened or its leading lines cannot be read.
+void generateNormalizedTransitionMatrixFromFile(CsrSparseMatrix *transitionMatrix,
+	Parameters *parameters);
+
+// Function savePagerankToFile appends or overwrites the pagerank vector
+// "pagerankVector" to the file with the filename supplied in the arguments.
+// The file is opened in append mode when append is true and truncated
+// otherwise; if it cannot be opened an error is printed and nothing is saved.
+void savePagerankToFile(char *filename, bool append, double *pagerankVector,
+	int vectorSize, int* iterations, int maxIterationsForConvergence);
+
+// Function initialize allocates memory for the pagerank vector, reads the
+// dataset from the file and creates the transition probability distribution
+// matrix.
+void initialize(CsrSparseMatrix *transitionMatrix, double **pagerankVector,
+	Parameters *parameters);
+
+// Function vectorNorm calculates the first norm of a vector.
+double vectorNorm(double *vector, int vectorSize);
+
+// Function calculateNextPagerank calculates the next pagerank vector.
+void calculateNextPagerank(CsrSparseMatrix *transitionMatrix,
+	double *previousPagerankVector, double **pagerankVector,
+	double *linksFromConvergedPagesPagerankVector,
+	double *convergedPagerankVector, int vectorSize, double dampingFactor);
+
+// Function pagerank iteratively calculates the pagerank of each page until
+// either the convergence criterion is met or the maximum number of iterations
+// is reached. convergenceStatus is reset to false before iterating.
+// NOTE(review): the function allocates an int array with one entry per page
+// and has an int* return type; presumably that array is what is returned and
+// the caller owns it -- confirm against the function body.
+int* pagerank(CsrSparseMatrix *transitionMatrix, double **pagerankVector,
+	bool *convergenceStatus, Parameters parameters, int* maxIterationsForConvergence);
+
+#endif // SERIAL_GS_PAGERANK_FUNCTIONS_H
\ No newline at end of file