diff --git a/datasets/Stanford Large Network Dataset Collection/README.md b/datasets/Stanford Large Network Dataset Collection/README.md index 4f4da65..9723267 100644 --- a/datasets/Stanford Large Network Dataset Collection/README.md +++ b/datasets/Stanford Large Network Dataset Collection/README.md @@ -5,19 +5,4 @@ More details about the datasets can be found in the table bellow. | Dataset directory | Description | Nodes | Edges | URL link | | ----------- | ----------- | ----------- | ----------- | ----------- | | web-Google | "web-Google" | 875,713 | 5,105,039 | [link](https://snap.stanford.edu/data/web-Google.html) | -| wiki-Talk | "wiki-Talk" | 2,394,385 | 5,021,410 | [link](https://snap.stanford.edu/data/wiki-Talk.html) | - -### Adjustments made to the datasets: -The datasets had four (4) lines of meta-data at the beginning of the files and the data were saved in a form that had one edge per line following the pattern "`linkFrom\tlinkTo\n`", like so: -``` -linkFrom linkTo -linkFrom linkTo -... -``` -A program in C was written to discard the meta-data lines and transform the pattern in a new one that has all the out-links of a page in a single row, like so: -``` -page_1: linkTo_1 linkTo_2 linkTo_3 ... -page_2: linkTo_1 ... -``` - -The program is provided in this repository, under the pathname: `/datasets/Stanford Large Network Dataset Collection/graphToAdjacencyList.c`. \ No newline at end of file +| wiki-Talk | "wiki-Talk" | 2,394,385 | 5,021,410 | [link](https://snap.stanford.edu/data/wiki-Talk.html) | \ No newline at end of file diff --git a/datasets/Stanford Large Network Dataset Collection/graphToAdjacencyList.c b/datasets/Stanford Large Network Dataset Collection/graphToAdjacencyList.c deleted file mode 100644 index 4ec0eb0..0000000 --- a/datasets/Stanford Large Network Dataset Collection/graphToAdjacencyList.c +++ /dev/null @@ -1,90 +0,0 @@ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -typedef struct pair { - int from; - int to; -} Pair; - -int main(int argc, char const *argv[]) { - if (argc != 2) { - printf("Usage: ./graphToAdjacencyList <graphFile>\n"); - exit(EXIT_FAILURE); - } - - FILE *graphFile = fopen(argv[1], "r"); - if (!graphFile) { - printf("Error opening file \n"); - exit(EXIT_FAILURE); - } - - char buffer[512]; - // Skips the first two lines - fgets(buffer, 512, graphFile); - fgets(buffer, 512, graphFile); - - // Third line has the numbers of nodes and edges, we need to parse them. - int numberOfNodes = 0, numberOfEdges; - fgets(buffer, 512, graphFile); - - char *token = strtok(buffer, " "); - int getNodes = 0, getEdges = 0; - - while (token != NULL) { - if (strcmp(token, "Nodes:") == 0) { - getNodes = 1; - } else if (getNodes == 1) { - numberOfNodes = atoi(token); - getNodes = 0; - } else if (strcmp(token, "Edges:") == 0) { - getEdges = 1; - } else if (getEdges == 1) { - numberOfEdges = atoi(token); - break; - } - token = strtok (NULL, " ,.-"); - } - - // Skips the fourth line - fgets(buffer, 512, graphFile); - - Pair **edges = (Pair **) malloc(numberOfEdges * sizeof(Pair *)); - printf("Reading edges from file...\n"); - for (int i=0; i<numberOfEdges; i++) { - edges[i] = (Pair *) malloc(sizeof(Pair)); - - int f_from = 0, f_to = 0; - if (!fscanf(graphFile, "%d %d", &f_from, &f_to)) { - break; - } - - edges[i]->from = f_from; - edges[i]->to = f_to; - } - - - FILE *adjacentListFile = fopen("adjacentListCreated", "w"); - int index = 0; - - printf("\nWriting nodes to file...\n"); - for (int i=0; i<numberOfNodes; ++i) { - int hasOutlinks = 0; - - fprintf(adjacentListFile, "%d: ", i); - - while (index < numberOfEdges && edges[index]->from == i) { - fprintf(adjacentListFile, "%d ", edges[index]->to); - if (!hasOutlinks) hasOutlinks = 1; - ++index; - } - - if (!hasOutlinks) { - fprintf(adjacentListFile, "-1 "); - } - - fprintf(adjacentListFile, "\n"); - } - - return 0; -} \ No newline at end of file diff --git a/datasets/Stanford Large Network Dataset Collection/web-Google.tar.xz b/datasets/Stanford Large Network Dataset Collection/web-Google.tar.xz index fa398cf..9922d3a 100644 Binary files a/datasets/Stanford Large Network Dataset Collection/web-Google.tar.xz and b/datasets/Stanford Large Network Dataset Collection/web-Google.tar.xz differ diff --git a/datasets/Stanford Large Network Dataset Collection/wiki-Talk.tar.xz b/datasets/Stanford Large Network Dataset Collection/wiki-Talk.tar.xz index 25cef55..84d497f 100644 Binary files a/datasets/Stanford Large Network Dataset Collection/wiki-Talk.tar.xz and b/datasets/Stanford Large Network Dataset Collection/wiki-Talk.tar.xz differ diff --git a/serial/serial_gs_pagerank.c b/serial/serial_gs_pagerank.c index 355915f..fdb7031 100644 --- a/serial/serial_gs_pagerank.c +++ b/serial/serial_gs_pagerank.c @@ -9,6 +9,7 @@ double seq_time; int main(int argc, char **argv) { SparseMatrix transitionMatrix; double *pagerankVector; + bool convergenceStatus; Parameters parameters; transitionMatrix = createSparseMatrix(); @@ -20,10 +21,17 @@ int main(int argc, char **argv) { // Starts wall-clock timer gettimeofday (&startwtime, NULL); - int iterations = pagerank(&transitionMatrix, &pagerankVector, parameters); + int iterations = pagerank(&transitionMatrix, &pagerankVector, + &convergenceStatus, parameters); if (parameters.verbose) { - printf("\n----- Results -----\ - \nTotal iterations = %d\n", iterations); + printf(ANSI_COLOR_YELLOW "\n----- RESULTS -----\n" ANSI_COLOR_RESET); + if (convergenceStatus) { + printf(ANSI_COLOR_GREEN "Pagerank converged after %d iterations!\n" \ + ANSI_COLOR_RESET, iterations); + } else { + printf(ANSI_COLOR_RED "Pagerank did not converge after max number of" \ + " iterations (%d) was reached!\n" ANSI_COLOR_RESET, iterations); + } } // Stops wall-clock timer diff --git a/serial/serial_gs_pagerank_functions.c b/serial/serial_gs_pagerank_functions.c index 99ef587..c81f050 100644 --- a/serial/serial_gs_pagerank_functions.c +++ b/serial/serial_gs_pagerank_functions.c @@ -9,11 +9,12 @@ const char *ARGUMENT_OUTPUT_FILENAME = "-o"; const int NUMERICAL_BASE = 10; char *DEFAULT_OUTPUT_FILENAME = "pagerank_output"; -const int MAX_PAGE_LINKS_TEXT_SIZE = 4096; +const int FILE_READ_BUFFER_SIZE = 4096; // ==================== PAGERANK ==================== -int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, Parameters parameters) { +int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, + bool *convergenceStatus, Parameters parameters) { int iterations = 0; double delta, *vectorDifference = (double *) malloc(parameters.numberOfPages * sizeof(double)), @@ -22,6 +23,7 @@ int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, Parameters **linksFromConvergedPages = (double **) malloc(parameters.numberOfPages * sizeof(double *)), *linksFromConvergedPagesPagerankVector = (double *) malloc(parameters.numberOfPages * sizeof(double)); bool *converganceMatrix = (bool *) malloc(parameters.numberOfPages * sizeof(bool)); + *convergenceStatus = false; for (int i=0; i<parameters.numberOfPages; ++i) { convergedPagerankVector[i] = 0; @@ -35,7 +37,7 @@ int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, Parameters } if (parameters.verbose) { - printf("\n----- Starting iterations -----\n"); + printf(ANSI_COLOR_YELLOW "\n----- Starting iterations -----\n" ANSI_COLOR_RESET); } do { @@ -57,6 +59,9 @@ int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, Parameters vectorDifference[i] = (*pagerankVector)[i] - previousPagerankVector[i]; } delta = vectorNorm(vectorDifference, parameters.numberOfPages); + if (delta < parameters.convergenceCriterion) { + *convergenceStatus = true; + } if (iterations && !iterations % 10) { for (int i=0; i<parameters.numberOfPages; ++i) { @@ -88,8 +93,12 @@ int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, Parameters } ++iterations; - printf("Iteration %d: delta = %f\n", iterations, delta); - } while (delta > parameters.convergenceCriterion && + if (iterations%2) { + printf(ANSI_COLOR_BLUE "Iteration %d: delta = %f\n" ANSI_COLOR_RESET, iterations, delta); + } else { + printf(ANSI_COLOR_CYAN "Iteration %d: delta = %f\n" ANSI_COLOR_RESET, iterations, delta); + } + } while (!*convergenceStatus && (parameters.maxIterations == 0 || iterations < parameters.maxIterations)); if (!parameters.history) { @@ -112,22 +121,22 @@ void initialize(SparseMatrix *transitionMatrix, // Reads web graph from file if ((*parameters).verbose) { - printf("----- Reading graph from file -----\n"); + printf(ANSI_COLOR_YELLOW "----- Reading graph from file -----\n" ANSI_COLOR_RESET); } generateNormalizedTransitionMatrixFromFile(transitionMatrix, parameters); // Outputs the algorithm parameters to the console if ((*parameters).verbose) { - printf("\n----- Running with parameters -----\ - \nNumber of pages: %d", (*parameters).numberOfPages); + printf(ANSI_COLOR_YELLOW "\n----- Running with parameters -----\n" ANSI_COLOR_RESET\ + "Number of pages: %d", (*parameters).numberOfPages); if (!(*parameters).maxIterations) { printf("\nMaximum number of iterations: inf"); } else { printf("\nMaximum number of iterations: %d", (*parameters).maxIterations); } - printf("\nConvergence criterion: %f\ - \nDamping factor: %f\ - \nGraph filename: %s\n", (*parameters).convergenceCriterion, + printf("\nConvergence criterion: %f" \ + "\nDamping factor: %f" \ + "\nGraph filename: %s\n", (*parameters).convergenceCriterion, (*parameters).dampingFactor, (*parameters).graphFilename); } @@ -266,37 +275,88 @@ void generateNormalizedTransitionMatrixFromFile(SparseMatrix *transitionMatrix, exit(EXIT_FAILURE); } - int pageIndex, count = 0; - while (fscanf(graphFile, "%d:", &pageIndex) != EOF) { - if (!(pageIndex%51050)) { - printf("\t%d\t%d%%\n", pageIndex, ++count); - } + char buffer[FILE_READ_BUFFER_SIZE]; + char *readResult; + // Skips the first two lines + readResult = fgets(buffer, FILE_READ_BUFFER_SIZE, graphFile); + readResult = fgets(buffer, FILE_READ_BUFFER_SIZE, graphFile); + if (readResult == NULL) { + printf("Error while reading from the file. Does the file have the correct format?\n"); + exit(EXIT_FAILURE); + } - char *restOfLine = malloc(MAX_PAGE_LINKS_TEXT_SIZE); - if (!fgets(restOfLine, MAX_PAGE_LINKS_TEXT_SIZE, graphFile)) { - exit(EXIT_FAILURE); - } + // Third line contains the numbers of nodes and edges + int numberOfNodes = 0, numberOfEdges; + + readResult = fgets(buffer, FILE_READ_BUFFER_SIZE, graphFile); + if (readResult == NULL) { + printf("Error while reading from the file. Does the file have the correct format?\n"); + exit(EXIT_FAILURE); + } - char *token = strtok(restOfLine, " "); + // Parses the number of nodes and number of edges + { + // Splits string to whitespace + char *token = strtok(buffer, " "); + bool nextIsNodes = false, nextIsEdges = false; while (token != NULL) { - if (strcmp(token, "\n") == 0) { - //token = strtok (NULL, " "); + if (strcmp(token, "Nodes:") == 0) { + nextIsNodes = true; + } else if (nextIsNodes) { + numberOfNodes = atoi(token); + nextIsNodes = false; + } else if (strcmp(token, "Edges:") == 0) { + nextIsEdges = true; + } else if (nextIsEdges) { + numberOfEdges = atoi(token); break; } - int outLink = atoi(token); - if (outLink != -1) { - apendElement(transitionMatrix, 1, pageIndex, outLink); - } - token = strtok (NULL, " "); + // Gets next string token + token = strtok (NULL, " ,.-"); } } - printf("\t100%%\n"); - printf("number of edges = %d\n", transitionMatrix->elements); - (*parameters).numberOfPages = pageIndex + 1; + if ((*parameters).verbose) { + printf("The number of pages is: %d\nThe number of edges is: %d\n", + numberOfNodes, numberOfEdges); + } + (*parameters).numberOfPages = numberOfNodes; + + // Skips the fourth line + readResult = fgets(buffer, 512, graphFile); + if (readResult == NULL) { + printf("Error while reading from the file. Does the file have the correct format?\n"); + exit(EXIT_FAILURE); + } + + printf("SIZE OF STRUCT = %lu Bytes\n", sizeof(SparseMatrixElement)); + + int fivePercentIncrements = (int) numberOfEdges/20; + fivePercentIncrements = fivePercentIncrements != 0 ? fivePercentIncrements : 1; + + for (int i=0; i<numberOfEdges; i++) { + if (((*parameters).verbose) && ((i % fivePercentIncrements) == 0)) { + int percentage = (i/fivePercentIncrements)*5; + printf("%d%% done", percentage); + if (percentage%20 == 0) { + printf("\n"); + } else { + printf(" •••• "); + } + } + + int fileFrom = 0, fileTo = 0; + if (!fscanf(graphFile, "%d %d", &fileFrom, &fileTo)) { + break; + } + + apendElement(transitionMatrix, 1, fileFrom, fileTo); + } + // Calculates the outdegree of each page and assigns the uniform probability + // of transition to the elements of the corresponding row int currentRow = transitionMatrix->firstElement->rowIndex; SparseMatrixElement *startElement = transitionMatrix->firstElement; while(true) { @@ -315,9 +375,10 @@ void generateNormalizedTransitionMatrixFromFile(SparseMatrix *transitionMatrix, // Assigns the value 1/outdegree to current page's columns currentElement = startElement; + double pageUniformProbability = 1. / pageOutdegree; for (int i=0; i<pageOutdegree; ++i) { if (currentElement->rowIndex == currentRow) { - currentElement->value = 1. / pageOutdegree; + currentElement->value = pageUniformProbability; currentElement = currentElement->nextElement; } else { break; @@ -341,18 +402,18 @@ void generateNormalizedTransitionMatrixFromFile(SparseMatrix *transitionMatrix, * correct (valid) way to use the program. */ void validUsage(char *programName) { - printf("%s [-c convergence_criterion] [-m max_iterations] [-a alpha] [-v] [-h] [-o output_filename] <graph_file>\ - \n-c convergence_criterion\ - \n\tthe convergence tolerance criterion\ - \n-m max_iterations\ - \n\tmaximum number of iterations to perform\ - \n-a alpha\ - \n\tthe damping factor\ - \n-v enable verbal output\ - \n-h enable history output to file\ - \n-o output_filename\ - \n\tfilename and path for the output\ - \n", programName); + printf("%s [-c convergence_criterion] [-m max_iterations] [-a alpha] [-v] [-h] [-o output_filename] <graph_file>" \ + "\n-c convergence_criterion" \ + "\n\tthe convergence tolerance criterion" \ + "\n-m max_iterations" \ + "\n\tmaximum number of iterations to perform" \ + "\n-a alpha" \ + "\n\tthe damping factor" \ + "\n-v enable verbal output" \ + "\n-h enable history output to file" \ + "\n-o output_filename" \ + "\n\tfilename and path for the output" \ + "\n", programName); exit(EXIT_FAILURE); } diff --git a/serial/serial_gs_pagerank_functions.h b/serial/serial_gs_pagerank_functions.h index 853f8a0..9e5267c 100644 --- a/serial/serial_gs_pagerank_functions.h +++ b/serial/serial_gs_pagerank_functions.h @@ -9,6 +9,13 @@ #include "sparse_matrix.h" +#define ANSI_COLOR_RED "\x1B[31m" +#define ANSI_COLOR_GREEN "\x1B[32m" +#define ANSI_COLOR_YELLOW "\x1B[33m" +#define ANSI_COLOR_BLUE "\x1B[34m" +#define ANSI_COLOR_CYAN "\x1B[36m" +#define ANSI_COLOR_RESET "\x1B[0m" + /* * Constant strings that store the command line options available. */ @@ -79,6 +86,7 @@ void matrixVectorMultiplication(SparseMatrix *transitionMatrix, // Function pagerank iteratively calculates the pagerank of each page until // either the convergence criterion is met or the maximum number of iterations // is reached. -int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, Parameters parameters); +int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, + bool *convergenceStatus, Parameters parameters); #endif // SERIAL_GS_PAGERANK_FUNCTIONS_H \ No newline at end of file diff --git a/serial/sparse_matrix.c b/serial/sparse_matrix.c index ba77587..2df1b0d 100644 --- a/serial/sparse_matrix.c +++ b/serial/sparse_matrix.c @@ -4,6 +4,7 @@ SparseMatrix createSparseMatrix() { SparseMatrix sparseMatrix; sparseMatrix.elements = 0; sparseMatrix.firstElement = NULL; + sparseMatrix.lastElement = NULL; return sparseMatrix; } @@ -18,14 +19,13 @@ void apendElement(SparseMatrix *sparseMatrix, double value, int row, int column) if (sparseMatrix->firstElement == NULL) { // Sparse matrix is empty, this is the first element sparseMatrix->firstElement = newElement; + sparseMatrix->lastElement = newElement; } else { //Gets last element of the matrix - SparseMatrixElement *lastElement = sparseMatrix->firstElement; - while (lastElement->nextElement != NULL) { - lastElement = lastElement->nextElement; - } + SparseMatrixElement *lastElement = sparseMatrix->lastElement; lastElement->nextElement = newElement; + sparseMatrix->lastElement = newElement; } sparseMatrix->elements = sparseMatrix->elements + 1; @@ -39,6 +39,7 @@ bool deleteElement(SparseMatrix *sparseMatrix, int row, int column) { // Matrix has one element. Deletes it. free(sparseMatrix->firstElement); sparseMatrix->firstElement = NULL; + sparseMatrix->lastElement = NULL; sparseMatrix->elements = sparseMatrix->elements - 1; return true; } @@ -58,6 +59,9 @@ bool deleteElement(SparseMatrix *sparseMatrix, int row, int column) { SparseMatrixElement *nextElement = currentElement->nextElement; if (nextElement->rowIndex == row && nextElement->columnIndex == column) { currentElement->nextElement = nextElement->nextElement; + if (currentElement->nextElement == NULL) { + sparseMatrix->lastElement = currentElement; + } free(nextElement); sparseMatrix->elements = sparseMatrix->elements - 1; return true; diff --git a/serial/sparse_matrix.h b/serial/sparse_matrix.h index cfa7f8e..7a977d2 100644 --- a/serial/sparse_matrix.h +++ b/serial/sparse_matrix.h @@ -15,6 +15,7 @@ typedef struct sparseMatrixElement { typedef struct sparseMatrix { int elements; SparseMatrixElement *firstElement; + SparseMatrixElement *lastElement; } SparseMatrix; SparseMatrix createSparseMatrix();