Browse Source

Fix for fast file reading, Console output improvements

master
Apostolos Fanakis 6 years ago
parent
commit
607838656e
No known key found for this signature in database GPG Key ID: 56CE2DEDE9F1FB78
  1. 15
      datasets/Stanford Large Network Dataset Collection/README.md
  2. 90
      datasets/Stanford Large Network Dataset Collection/graphToAdjacencyList.c
  3. BIN
      datasets/Stanford Large Network Dataset Collection/web-Google.tar.xz
  4. BIN
      datasets/Stanford Large Network Dataset Collection/wiki-Talk.tar.xz
  5. 14
      serial/serial_gs_pagerank.c
  6. 149
      serial/serial_gs_pagerank_functions.c
  7. 10
      serial/serial_gs_pagerank_functions.h
  8. 12
      serial/sparse_matrix.c
  9. 1
      serial/sparse_matrix.h

15
datasets/Stanford Large Network Dataset Collection/README.md

@ -6,18 +6,3 @@ More details about the datasets can be found in the table below.
| ----------- | ----------- | ----------- | ----------- | ----------- |
| web-Google | "web-Google" | 875,713 | 5,105,039 | [link](https://snap.stanford.edu/data/web-Google.html) |
| wiki-Talk | "wiki-Talk" | 2,394,385 | 5,021,410 | [link](https://snap.stanford.edu/data/wiki-Talk.html) |
### Adjustments made to the datasets:
The datasets had four (4) lines of meta-data at the beginning of the files and the data were saved in a form that had one edge per line following the pattern "`linkFrom\tlinkTo\n`", like so:
```
linkFrom linkTo
linkFrom linkTo
...
```
A program in C was written to discard the meta-data lines and transform the pattern into a new one that has all the out-links of a page in a single row, like so:
```
page_1: linkTo_1 linkTo_2 linkTo_3 ...
page_2: linkTo_1 ...
```
The program is provided in this repository, under the pathname: `/datasets/Stanford Large Network Dataset Collection/graphToAdjacencyList.c`.

90
datasets/Stanford Large Network Dataset Collection/graphToAdjacencyList.c

@ -1,90 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * One edge of the graph as read from the input file: the first integer of an
 * input line is stored in `from`, the second one in `to`.
 */
struct pair {
    int from;
    int to;
};

typedef struct pair Pair;
/*
 * Converts a graph stored as an edge list into an adjacency list.
 *
 * Expects exactly one argument: the path of the input graph file. The first
 * four lines of the input are treated as a header: lines one, two and four are
 * skipped, line three is scanned for "Nodes: <n>" and "Edges: <m>". Up to <m>
 * "from to" integer pairs are then read, and the result is written to the file
 * "adjacentListCreated" in the working directory, one line per node id in
 * [0, n): "id: to_1 to_2 ... ", or "id: -1 " when the node has no out-links.
 *
 * The writer consumes the edges in file order while walking the node ids in
 * ascending order, so the edges are expected to be grouped by ascending "from"
 * id. Edges that do not fit that order are not written; a warning reports how
 * many were left out.
 *
 * Returns 0 on success. Exits with EXIT_FAILURE on wrong usage, on a malformed
 * header, and on I/O or allocation errors.
 */
int main(int argc, char const *argv[]) {
    if (argc != 2) {
        printf("Usage: ./graphToAdjacencyList <graphFile>\n");
        exit(EXIT_FAILURE);
    }

    FILE *graphFile = fopen(argv[1], "r");
    if (!graphFile) {
        printf("Error opening file \n");
        exit(EXIT_FAILURE);
    }

    char buffer[512];

    // Skips the first two lines and keeps the third one in the buffer; the
    // third line has the numbers of nodes and edges, we need to parse them.
    for (int line = 0; line < 3; ++line) {
        if (!fgets(buffer, sizeof buffer, graphFile)) {
            printf("Error while reading from the file. Does the file have the correct format?\n");
            fclose(graphFile);
            exit(EXIT_FAILURE);
        }
    }

    // Both counters start at zero so that a header without the expected
    // tokens yields an empty graph instead of reading an indeterminate value.
    int numberOfNodes = 0, numberOfEdges = 0;
    int getNodes = 0, getEdges = 0;
    char *token = strtok(buffer, " ");
    while (token != NULL) {
        if (strcmp(token, "Nodes:") == 0) {
            getNodes = 1;
        } else if (getNodes == 1) {
            numberOfNodes = atoi(token);
            getNodes = 0;
        } else if (strcmp(token, "Edges:") == 0) {
            getEdges = 1;
        } else if (getEdges == 1) {
            numberOfEdges = atoi(token);
            break;
        }
        token = strtok (NULL, " ,.-");
    }

    if (numberOfNodes < 0 || numberOfEdges < 0) {
        printf("Error while reading from the file. Does the file have the correct format?\n");
        fclose(graphFile);
        exit(EXIT_FAILURE);
    }

    // Skips the fourth line
    if (!fgets(buffer, sizeof buffer, graphFile)) {
        printf("Error while reading from the file. Does the file have the correct format?\n");
        fclose(graphFile);
        exit(EXIT_FAILURE);
    }

    // One contiguous array for all the edges. At least one element is always
    // requested so that a NULL result can only mean an allocation failure.
    Pair *edges = calloc(numberOfEdges > 0 ? (size_t) numberOfEdges : 1, sizeof *edges);
    if (!edges) {
        printf("Error allocating memory for the edges\n");
        fclose(graphFile);
        exit(EXIT_FAILURE);
    }

    printf("Reading edges from file...\n");
    int edgesRead = 0;
    while (edgesRead < numberOfEdges) {
        int f_from = 0, f_to = 0;
        // fscanf returns EOF (a negative, hence "true" value) at end of file,
        // so the only reliable success test is the number of items converted.
        if (fscanf(graphFile, "%d %d", &f_from, &f_to) != 2) {
            break;
        }
        edges[edgesRead].from = f_from;
        edges[edgesRead].to = f_to;
        ++edgesRead;
    }
    fclose(graphFile);

    if (edgesRead < numberOfEdges) {
        printf("Warning: expected %d edges but only %d could be read\n",
            numberOfEdges, edgesRead);
    }

    FILE *adjacentListFile = fopen("adjacentListCreated", "w");
    if (!adjacentListFile) {
        printf("Error opening output file \n");
        free(edges);
        exit(EXIT_FAILURE);
    }

    int index = 0;
    printf("\nWriting nodes to file...\n");
    for (int i=0; i<numberOfNodes; ++i) {
        int hasOutlinks = 0;
        fprintf(adjacentListFile, "%d: ", i);

        // Only the edges that were actually read are visited.
        while (index < edgesRead && edges[index].from == i) {
            fprintf(adjacentListFile, "%d ", edges[index].to);
            hasOutlinks = 1;
            ++index;
        }

        // Nodes without out-links are marked with the sentinel value -1
        if (!hasOutlinks) {
            fprintf(adjacentListFile, "-1 ");
        }
        fprintf(adjacentListFile, "\n");
    }

    if (index < edgesRead) {
        printf("Warning: %d edges were not written (edges not grouped by ascending source id, or source id outside [0, %d))\n",
            edgesRead - index, numberOfNodes);
    }

    free(edges);

    // fclose flushes the buffered output, a failure here means lost data.
    if (fclose(adjacentListFile) != 0) {
        printf("Error writing output file \n");
        exit(EXIT_FAILURE);
    }

    return 0;
}

BIN
datasets/Stanford Large Network Dataset Collection/web-Google.tar.xz

Binary file not shown.

BIN
datasets/Stanford Large Network Dataset Collection/wiki-Talk.tar.xz

Binary file not shown.

14
serial/serial_gs_pagerank.c

@ -9,6 +9,7 @@ double seq_time;
int main(int argc, char **argv) {
SparseMatrix transitionMatrix;
double *pagerankVector;
bool convergenceStatus;
Parameters parameters;
transitionMatrix = createSparseMatrix();
@ -20,10 +21,17 @@ int main(int argc, char **argv) {
// Starts wall-clock timer
gettimeofday (&startwtime, NULL);
int iterations = pagerank(&transitionMatrix, &pagerankVector, parameters);
int iterations = pagerank(&transitionMatrix, &pagerankVector,
&convergenceStatus, parameters);
if (parameters.verbose) {
printf("\n----- Results -----\
\nTotal iterations = %d\n", iterations);
printf(ANSI_COLOR_YELLOW "\n----- RESULTS -----\n" ANSI_COLOR_RESET);
if (convergenceStatus) {
printf(ANSI_COLOR_GREEN "Pagerank converged after %d iterations!\n" \
ANSI_COLOR_RESET, iterations);
} else {
printf(ANSI_COLOR_RED "Pagerank did not converge after max number of" \
" iterations (%d) was reached!\n" ANSI_COLOR_RESET, iterations);
}
}
// Stops wall-clock timer

149
serial/serial_gs_pagerank_functions.c

@ -9,11 +9,12 @@ const char *ARGUMENT_OUTPUT_FILENAME = "-o";
const int NUMERICAL_BASE = 10;
char *DEFAULT_OUTPUT_FILENAME = "pagerank_output";
const int MAX_PAGE_LINKS_TEXT_SIZE = 4096;
const int FILE_READ_BUFFER_SIZE = 4096;
// ==================== PAGERANK ====================
int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, Parameters parameters) {
int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector,
bool *convergenceStatus, Parameters parameters) {
int iterations = 0;
double delta,
*vectorDifference = (double *) malloc(parameters.numberOfPages * sizeof(double)),
@ -22,6 +23,7 @@ int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, Parameters
**linksFromConvergedPages = (double **) malloc(parameters.numberOfPages * sizeof(double *)),
*linksFromConvergedPagesPagerankVector = (double *) malloc(parameters.numberOfPages * sizeof(double));
bool *converganceMatrix = (bool *) malloc(parameters.numberOfPages * sizeof(bool));
*convergenceStatus = false;
for (int i=0; i<parameters.numberOfPages; ++i) {
convergedPagerankVector[i] = 0;
@ -35,7 +37,7 @@ int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, Parameters
}
if (parameters.verbose) {
printf("\n----- Starting iterations -----\n");
printf(ANSI_COLOR_YELLOW "\n----- Starting iterations -----\n" ANSI_COLOR_RESET);
}
do {
@ -57,6 +59,9 @@ int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, Parameters
vectorDifference[i] = (*pagerankVector)[i] - previousPagerankVector[i];
}
delta = vectorNorm(vectorDifference, parameters.numberOfPages);
if (delta < parameters.convergenceCriterion) {
*convergenceStatus = true;
}
if (iterations && !iterations % 10) {
for (int i=0; i<parameters.numberOfPages; ++i) {
@ -88,8 +93,12 @@ int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, Parameters
}
++iterations;
printf("Iteration %d: delta = %f\n", iterations, delta);
} while (delta > parameters.convergenceCriterion &&
if (iterations%2) {
printf(ANSI_COLOR_BLUE "Iteration %d: delta = %f\n" ANSI_COLOR_RESET, iterations, delta);
} else {
printf(ANSI_COLOR_CYAN "Iteration %d: delta = %f\n" ANSI_COLOR_RESET, iterations, delta);
}
} while (!*convergenceStatus &&
(parameters.maxIterations == 0 || iterations < parameters.maxIterations));
if (!parameters.history) {
@ -112,22 +121,22 @@ void initialize(SparseMatrix *transitionMatrix,
// Reads web graph from file
if ((*parameters).verbose) {
printf("----- Reading graph from file -----\n");
printf(ANSI_COLOR_YELLOW "----- Reading graph from file -----\n" ANSI_COLOR_RESET);
}
generateNormalizedTransitionMatrixFromFile(transitionMatrix, parameters);
// Outputs the algorithm parameters to the console
if ((*parameters).verbose) {
printf("\n----- Running with parameters -----\
\nNumber of pages: %d", (*parameters).numberOfPages);
printf(ANSI_COLOR_YELLOW "\n----- Running with parameters -----\n" ANSI_COLOR_RESET\
"Number of pages: %d", (*parameters).numberOfPages);
if (!(*parameters).maxIterations) {
printf("\nMaximum number of iterations: inf");
} else {
printf("\nMaximum number of iterations: %d", (*parameters).maxIterations);
}
printf("\nConvergence criterion: %f\
\nDamping factor: %f\
\nGraph filename: %s\n", (*parameters).convergenceCriterion,
printf("\nConvergence criterion: %f" \
"\nDamping factor: %f" \
"\nGraph filename: %s\n", (*parameters).convergenceCriterion,
(*parameters).dampingFactor, (*parameters).graphFilename);
}
@ -266,37 +275,88 @@ void generateNormalizedTransitionMatrixFromFile(SparseMatrix *transitionMatrix,
exit(EXIT_FAILURE);
}
int pageIndex, count = 0;
while (fscanf(graphFile, "%d:", &pageIndex) != EOF) {
if (!(pageIndex%51050)) {
printf("\t%d\t%d%%\n", pageIndex, ++count);
}
char buffer[FILE_READ_BUFFER_SIZE];
char *readResult;
// Skips the first two lines
readResult = fgets(buffer, FILE_READ_BUFFER_SIZE, graphFile);
readResult = fgets(buffer, FILE_READ_BUFFER_SIZE, graphFile);
if (readResult == NULL) {
printf("Error while reading from the file. Does the file have the correct format?\n");
exit(EXIT_FAILURE);
}
char *restOfLine = malloc(MAX_PAGE_LINKS_TEXT_SIZE);
if (!fgets(restOfLine, MAX_PAGE_LINKS_TEXT_SIZE, graphFile)) {
exit(EXIT_FAILURE);
}
// Third line contains the numbers of nodes and edges
int numberOfNodes = 0, numberOfEdges;
readResult = fgets(buffer, FILE_READ_BUFFER_SIZE, graphFile);
if (readResult == NULL) {
printf("Error while reading from the file. Does the file have the correct format?\n");
exit(EXIT_FAILURE);
}
char *token = strtok(restOfLine, " ");
// Parses the number of nodes and number of edges
{
// Splits string to whitespace
char *token = strtok(buffer, " ");
bool nextIsNodes = false, nextIsEdges = false;
while (token != NULL) {
if (strcmp(token, "\n") == 0) {
//token = strtok (NULL, " ");
if (strcmp(token, "Nodes:") == 0) {
nextIsNodes = true;
} else if (nextIsNodes) {
numberOfNodes = atoi(token);
nextIsNodes = false;
} else if (strcmp(token, "Edges:") == 0) {
nextIsEdges = true;
} else if (nextIsEdges) {
numberOfEdges = atoi(token);
break;
}
int outLink = atoi(token);
if (outLink != -1) {
apendElement(transitionMatrix, 1, pageIndex, outLink);
}
token = strtok (NULL, " ");
// Gets next string token
token = strtok (NULL, " ,.-");
}
}
printf("\t100%%\n");
printf("number of edges = %d\n", transitionMatrix->elements);
(*parameters).numberOfPages = pageIndex + 1;
if ((*parameters).verbose) {
printf("The number of pages is: %d\nThe number of edges is: %d\n",
numberOfNodes, numberOfEdges);
}
(*parameters).numberOfPages = numberOfNodes;
// Skips the fourth line
readResult = fgets(buffer, 512, graphFile);
if (readResult == NULL) {
printf("Error while reading from the file. Does the file have the correct format?\n");
exit(EXIT_FAILURE);
}
printf("SIZE OF STRUCT = %lu Bytes\n", sizeof(SparseMatrixElement));
int fivePercentIncrements = (int) numberOfEdges/20;
fivePercentIncrements = fivePercentIncrements != 0 ? fivePercentIncrements : 1;
for (int i=0; i<numberOfEdges; i++) {
if (((*parameters).verbose) && ((i % fivePercentIncrements) == 0)) {
int percentage = (i/fivePercentIncrements)*5;
printf("%d%% done", percentage);
if (percentage%20 == 0) {
printf("\n");
} else {
printf(" •••• ");
}
}
int fileFrom = 0, fileTo = 0;
if (!fscanf(graphFile, "%d %d", &fileFrom, &fileTo)) {
break;
}
apendElement(transitionMatrix, 1, fileFrom, fileTo);
}
// Calculates the outdegree of each page and assigns the uniform probability
// of transition to the elements of the corresponding row
int currentRow = transitionMatrix->firstElement->rowIndex;
SparseMatrixElement *startElement = transitionMatrix->firstElement;
while(true) {
@ -315,9 +375,10 @@ void generateNormalizedTransitionMatrixFromFile(SparseMatrix *transitionMatrix,
// Assigns the value 1/outdegree to current page's columns
currentElement = startElement;
double pageUniformProbability = 1. / pageOutdegree;
for (int i=0; i<pageOutdegree; ++i) {
if (currentElement->rowIndex == currentRow) {
currentElement->value = 1. / pageOutdegree;
currentElement->value = pageUniformProbability;
currentElement = currentElement->nextElement;
} else {
break;
@ -341,18 +402,18 @@ void generateNormalizedTransitionMatrixFromFile(SparseMatrix *transitionMatrix,
* correct (valid) way to use the program.
*/
void validUsage(char *programName) {
printf("%s [-c convergence_criterion] [-m max_iterations] [-a alpha] [-v] [-h] [-o output_filename] <graph_file>\
\n-c convergence_criterion\
\n\tthe convergence tolerance criterion\
\n-m max_iterations\
\n\tmaximum number of iterations to perform\
\n-a alpha\
\n\tthe damping factor\
\n-v enable verbal output\
\n-h enable history output to file\
\n-o output_filename\
\n\tfilename and path for the output\
\n", programName);
printf("%s [-c convergence_criterion] [-m max_iterations] [-a alpha] [-v] [-h] [-o output_filename] <graph_file>" \
"\n-c convergence_criterion" \
"\n\tthe convergence tolerance criterion" \
"\n-m max_iterations" \
"\n\tmaximum number of iterations to perform" \
"\n-a alpha" \
"\n\tthe damping factor" \
"\n-v enable verbal output" \
"\n-h enable history output to file" \
"\n-o output_filename" \
"\n\tfilename and path for the output" \
"\n", programName);
exit(EXIT_FAILURE);
}

10
serial/serial_gs_pagerank_functions.h

@ -9,6 +9,13 @@
#include "sparse_matrix.h"
#define ANSI_COLOR_RED "\x1B[31m"
#define ANSI_COLOR_GREEN "\x1B[32m"
#define ANSI_COLOR_YELLOW "\x1B[33m"
#define ANSI_COLOR_BLUE "\x1B[34m"
#define ANSI_COLOR_CYAN "\x1B[36m"
#define ANSI_COLOR_RESET "\x1B[0m"
/*
* Constant strings that store the command line options available.
*/
@ -79,6 +86,7 @@ void matrixVectorMultiplication(SparseMatrix *transitionMatrix,
// Function pagerank iteratively calculates the pagerank of each page until
// either the convergence criterion is met or the maximum number of iterations
// is reached.
int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector, Parameters parameters);
int pagerank(SparseMatrix *transitionMatrix, double **pagerankVector,
bool *convergenceStatus, Parameters parameters);
#endif // SERIAL_GS_PAGERANK_FUNCTIONS_H

12
serial/sparse_matrix.c

@ -4,6 +4,7 @@ SparseMatrix createSparseMatrix() {
SparseMatrix sparseMatrix;
sparseMatrix.elements = 0;
sparseMatrix.firstElement = NULL;
sparseMatrix.lastElement = NULL;
return sparseMatrix;
}
@ -18,14 +19,13 @@ void apendElement(SparseMatrix *sparseMatrix, double value, int row, int column)
if (sparseMatrix->firstElement == NULL) {
// Sparse matrix is empty, this is the first element
sparseMatrix->firstElement = newElement;
sparseMatrix->lastElement = newElement;
} else {
//Gets last element of the matrix
SparseMatrixElement *lastElement = sparseMatrix->firstElement;
while (lastElement->nextElement != NULL) {
lastElement = lastElement->nextElement;
}
SparseMatrixElement *lastElement = sparseMatrix->lastElement;
lastElement->nextElement = newElement;
sparseMatrix->lastElement = newElement;
}
sparseMatrix->elements = sparseMatrix->elements + 1;
@ -39,6 +39,7 @@ bool deleteElement(SparseMatrix *sparseMatrix, int row, int column) {
// Matrix has one element. Deletes it.
free(sparseMatrix->firstElement);
sparseMatrix->firstElement = NULL;
sparseMatrix->lastElement = NULL;
sparseMatrix->elements = sparseMatrix->elements - 1;
return true;
}
@ -58,6 +59,9 @@ bool deleteElement(SparseMatrix *sparseMatrix, int row, int column) {
SparseMatrixElement *nextElement = currentElement->nextElement;
if (nextElement->rowIndex == row && nextElement->columnIndex == column) {
currentElement->nextElement = nextElement->nextElement;
if (currentElement->nextElement == NULL) {
sparseMatrix->lastElement = currentElement;
}
free(nextElement);
sparseMatrix->elements = sparseMatrix->elements - 1;
return true;

1
serial/sparse_matrix.h

@ -15,6 +15,7 @@ typedef struct sparseMatrixElement {
typedef struct sparseMatrix {
int elements;
SparseMatrixElement *firstElement;
SparseMatrixElement *lastElement;
} SparseMatrix;
SparseMatrix createSparseMatrix();

Loading…
Cancel
Save