Browse Source

Add bigger datasets and tidy folder up

master
Apostolos Fanakis 6 years ago
parent
commit
f8c95e7c63
No known key found for this signature in database GPG Key ID: 56CE2DEDE9F1FB78
  1. 2
      .gitignore
  2. 23
      datasets/Stanford Large Network Dataset Collection/README.md
  3. 90
      datasets/Stanford Large Network Dataset Collection/graphToAdjacencyList.c
  4. BIN
      datasets/Stanford Large Network Dataset Collection/web-Google.tar.xz
  5. BIN
      datasets/Stanford Large Network Dataset Collection/wiki-Talk.tar.xz
  6. 10
      datasets/University of Toronto/README.md
  7. 0
      datasets/University of Toronto/_architecture/graph/adj_list
  8. 0
      datasets/University of Toronto/_blues/graph/adj_list
  9. 0
      datasets/University of Toronto/_search_engines/graph/adj_list
  10. 0
      datasets/University of Toronto/_weather/graph/adj_list
  11. 36996
      datasets/_architecture/graph/nodes
  12. 26771
      datasets/_blues/graph/nodes
  13. 58296
      datasets/_search_engines/graph/nodes
  14. 40056
      datasets/_weather/graph/nodes
  15. 80
      datasets/list2matrix.c
  16. 5
      datasets/smallset
  17. 1800
      datasets/toronto_death_penalty_refined_adj
  18. BIN
      serial_v2/pagerank.out
  19. 1
      serial_v2/pagerank_output
  20. BIN
      serial_v2/serial_gs_pagerank.o
  21. BIN
      serial_v2/serial_gs_pagerank_functions.o

2
.gitignore

@ -54,4 +54,4 @@ dkms.conf
#Output files #Output files
*_output *_output
adj_matrix adj_matrix

23
datasets/Stanford Large Network Dataset Collection/README.md

@ -0,0 +1,23 @@
The datasets in this folder were downloaded from the website of the Stanford Network Analysis Project (SNAP), found [here](https://snap.stanford.edu/data/).
More details about the datasets can be found in the table below.
| Dataset directory | Description | Nodes | Edges | URL link |
| ----------- | ----------- | ----------- | ----------- | ----------- |
| web-Google | "web-Google" | 875,713 | 5,105,039 | [link](https://snap.stanford.edu/data/web-Google.html) |
| wiki-Talk | "wiki-Talk" | 2,394,385 | 5,021,410 | [link](https://snap.stanford.edu/data/wiki-Talk.html) |
### Adjustments made to the datasets:
The datasets had four (4) lines of meta-data at the beginning of the files and the data were saved in a form that had one edge per line following the pattern "`linkFrom\tlinkTo\n`", like so:
```
linkFrom linkTo
linkFrom linkTo
...
```
A program in C was written to discard the meta-data lines and transform the pattern into a new one that has all the out-links of a page in a single row, like so:
```
page_1: linkTo_1 linkTo_2 linkTo_3 ...
page_2: linkTo_1 ...
```
The program is provided in this repository, under the pathname: `/datasets/Stanford Large Network Dataset Collection/graphToAdjacencyList.c`.

90
datasets/Stanford Large Network Dataset Collection/graphToAdjacencyList.c

@ -0,0 +1,90 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// A directed edge of the graph, as read from one "from to" pair of the input.
typedef struct pair {
    int from; // id of the node the link starts at
    int to;   // id of the node the link points to
} Pair;

// Looks for `label` (e.g. "Nodes:") inside `line` and parses the integer that
// follows it. Returns 1 and stores the value in *value on success, 0 when the
// label is missing or is not followed by a non-negative integer.
static int parseCountAfterLabel(const char *line, const char *label, int *value) {
    const char *found = strstr(line, label);
    if (found == NULL) {
        return 0;
    }
    return sscanf(found + strlen(label), "%d", value) == 1 && *value >= 0;
}

// Converts an edge-list graph file into an adjacency-list file.
//
// Usage: ./graphToAdjacencyList <graphFile>
//
// Input:  four header lines (the third one holds "Nodes: <n>" and
//         "Edges: <m>"), followed by one "from to" integer pair per edge.
//         Each header line is assumed to fit in the 512-byte line buffer.
// Output: the file "adjacentListCreated" in the current directory, with one
//         row per node id in [0, n): "<id>: <to> <to> ... " or "<id>: -1 "
//         when no out-link was found for that node.
//
// The writer walks the edges once, in file order, so it only picks up edges
// that are grouped by ascending source id; a warning is printed when some
// edges could not be written because of that.
//
// Returns 0 on success; exits with EXIT_FAILURE on usage, I/O, parse or
// allocation errors.
int main(int argc, char const *argv[]) {
    if (argc != 2) {
        printf("Usage: ./graphToAdjacencyList <graphFile>\n");
        exit(EXIT_FAILURE);
    }

    FILE *graphFile = fopen(argv[1], "r");
    if (!graphFile) {
        printf("Error opening file \n");
        exit(EXIT_FAILURE);
    }

    // Header: lines 1, 2 and 4 are skipped, line 3 carries the node and edge
    // counts that size everything below.
    char buffer[512];
    int numberOfNodes = 0, numberOfEdges = 0;
    int headerOk = 1;
    for (int line = 1; line <= 4 && headerOk; ++line) {
        if (fgets(buffer, sizeof buffer, graphFile) == NULL) {
            headerOk = 0;
        } else if (line == 3) {
            headerOk = parseCountAfterLabel(buffer, "Nodes:", &numberOfNodes) &&
                       parseCountAfterLabel(buffer, "Edges:", &numberOfEdges);
        }
    }
    if (!headerOk) {
        fprintf(stderr, "Error parsing the header of the graph file\n");
        fclose(graphFile);
        exit(EXIT_FAILURE);
    }

    // One contiguous array instead of one allocation per edge. At least one
    // element is requested so that a NULL result always means failure.
    size_t capacity = numberOfEdges > 0 ? (size_t) numberOfEdges : 1;
    Pair *edges = malloc(capacity * sizeof *edges);
    if (edges == NULL) {
        fprintf(stderr, "Error allocating memory for %d edges\n", numberOfEdges);
        fclose(graphFile);
        exit(EXIT_FAILURE);
    }

    printf("Reading edges from file...\n");
    // fscanf returns EOF (a negative value) at end of file, so the result has
    // to be compared against the number of expected conversions.
    int edgesRead = 0;
    while (edgesRead < numberOfEdges &&
           fscanf(graphFile, "%d %d", &edges[edgesRead].from, &edges[edgesRead].to) == 2) {
        ++edgesRead;
    }
    fclose(graphFile);
    if (edgesRead < numberOfEdges) {
        fprintf(stderr, "Warning: header announces %d edges but only %d could be read\n",
                numberOfEdges, edgesRead);
    }

    FILE *adjacentListFile = fopen("adjacentListCreated", "w");
    if (!adjacentListFile) {
        fprintf(stderr, "Error opening output file adjacentListCreated\n");
        free(edges);
        exit(EXIT_FAILURE);
    }

    int index = 0;
    printf("\nWriting nodes to file...\n");
    for (int i = 0; i < numberOfNodes; ++i) {
        int firstEdgeOfNode = index;
        fprintf(adjacentListFile, "%d: ", i);
        while (index < edgesRead && edges[index].from == i) {
            fprintf(adjacentListFile, "%d ", edges[index].to);
            ++index;
        }
        // -1 marks a node for which no out-link was written.
        if (index == firstEdgeOfNode) {
            fprintf(adjacentListFile, "-1 ");
        }
        fprintf(adjacentListFile, "\n");
    }

    if (index < edgesRead) {
        fprintf(stderr,
                "Warning: %d edge(s) were not written; edges must be grouped by "
                "ascending source id within [0, %d)\n",
                edgesRead - index, numberOfNodes);
    }

    free(edges);
    // fclose flushes the buffered rows, so its result tells whether the
    // output actually reached the file.
    if (fclose(adjacentListFile) != 0) {
        fprintf(stderr, "Error writing output file adjacentListCreated\n");
        exit(EXIT_FAILURE);
    }
    return 0;
}

BIN
datasets/Stanford Large Network Dataset Collection/web-Google.tar.xz

Binary file not shown.

BIN
datasets/Stanford Large Network Dataset Collection/wiki-Talk.tar.xz

Binary file not shown.

10
datasets/University of Toronto/README.md

@ -0,0 +1,10 @@
The datasets in this folder were downloaded from the website of the computer science course at University of Toronto, found [here](http://www.cs.toronto.edu/~tsap/experiments/download/download.html).
More details about the datasets can be found in the table below.
| Dataset directory | Description | URL link |
| ----------- | ----------- | ----------- |
| \_architecture | "architecture" | [link](http://www.cs.toronto.edu/~tsap/experiments/download/_architecture.tar.Z) |
| \_blues | "blues" | [link](http://www.cs.toronto.edu/~tsap/experiments/download/_blues.tar.Z) |
| \_search_engines | "search_engines" | [link](http://www.cs.toronto.edu/~tsap/experiments/download/_search_engines.tar.Z) |
| \_weather | "weather" | [link](http://www.cs.toronto.edu/~tsap/experiments/download/_weather.tar.Z) |

0
datasets/_architecture/graph/adj_list → datasets/University of Toronto/_architecture/graph/adj_list

0
datasets/_blues/graph/adj_list → datasets/University of Toronto/_blues/graph/adj_list

0
datasets/_search_engines/graph/adj_list → datasets/University of Toronto/_search_engines/graph/adj_list

0
datasets/_weather/graph/adj_list → datasets/University of Toronto/_weather/graph/adj_list

36996
datasets/_architecture/graph/nodes

File diff suppressed because it is too large

26771
datasets/_blues/graph/nodes

File diff suppressed because it is too large

58296
datasets/_search_engines/graph/nodes

File diff suppressed because it is too large

40056
datasets/_weather/graph/nodes

File diff suppressed because it is too large

80
datasets/list2matrix.c

@ -1,80 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*** Writes "<dir>/graph/<leaf>" into dst (capacity cap).
     Returns 0 on success, -1 when the result does not fit. ***/
static int build_graph_path(char *dst, size_t cap, const char *dir, const char *leaf){
    int len = snprintf(dst, cap, "%s/graph/%s", dir, leaf);
    return (len < 0 || (size_t)len >= cap) ? -1 : 0;
}

/***
 Builds the adjacency matrix of a query from its adjacency list.

 The only argument is the directory name of the query. The program reads
   <dir>/graph/nodes     - starts with the number of nodes N
   <dir>/graph/adj_list  - N rows of the form "<id>: <j> <j> ... -1"
 and writes
   <dir>/graph/adj_matrix - N rows of N space-separated 0/1 values, where
                            the value in row i, column j is 1 when j appears
                            in the i-th row of the list.

 Returns 0 on success; exits with status 1 on usage, I/O, format or
 allocation errors.
***/
int main(int argc, char **argv){
    FILE *fnodes;
    char nodes_file[1000];
    FILE *flist;
    char list_file[1000];
    FILE *fmatrix;
    char matrix_file[1000];
    int i,j;
    int **E;
    const char *path;
    int N; // number of nodes

    if (argc != 2){
        printf("list2matrix query_directory\n");
        exit(1);
    }
    path = argv[1];

    /*** build every path up front so an over-long directory name is
         rejected instead of overflowing the fixed-size buffers ***/
    if (build_graph_path(nodes_file, sizeof nodes_file, path, "nodes") != 0 ||
        build_graph_path(list_file, sizeof list_file, path, "adj_list") != 0 ||
        build_graph_path(matrix_file, sizeof matrix_file, path, "adj_matrix") != 0){
        printf("ERROR: Directory name %s is too long\n", path);
        exit(1);
    }

    /*** open the nodes file to obtain the number of nodes ***/
    fnodes = fopen(nodes_file,"r");
    if (fnodes == NULL){
        printf("ERROR: Cant open file %s\n",nodes_file);
        exit(1);
    }
    if (fscanf(fnodes,"%d",&N) != 1 || N < 0){
        printf("ERROR: Cant read the number of nodes from %s\n",nodes_file);
        exit(1);
    }
    fclose(fnodes);

    /**** Read List and Construct the adjacency matrix E ****/
    /* at least one element is requested so NULL always means failure */
    E = malloc((size_t)(N > 0 ? N : 1) * sizeof *E);
    if (E == NULL){
        printf("ERROR: Out of memory\n");
        exit(1);
    }
    for (i = 0; i < N; i ++){
        /* calloc hands back a zero-filled row */
        E[i] = calloc((size_t)N, sizeof **E);
        if (E[i] == NULL){
            printf("ERROR: Out of memory\n");
            exit(1);
        }
    }

    flist = fopen(list_file,"r");
    if (flist == NULL){
        printf("ERROR: Cant open file %s\n",list_file);
        exit(1);
    }
    for (i = 0; i < N; i ++){
        /* "%*d:" skips the row label, then the first neighbour is read */
        if (fscanf(flist,"%*d: %d",&j) != 1){
            printf("ERROR: Malformed row %d in file %s\n",i,list_file);
            exit(1);
        }
        /* -1 terminates the row */
        while (j != -1){
            if (j < 0 || j >= N){
                printf("ERROR: Row %d of file %s links to node %d, outside [0, %d)\n",
                       i,list_file,j,N);
                exit(1);
            }
            E[i][j] = 1;
            if (fscanf(flist,"%d",&j) != 1){
                printf("ERROR: Row %d of file %s is not terminated by -1\n",i,list_file);
                exit(1);
            }
        }
    }
    fclose(flist);

    /*** print the adjacency matrix ***/
    fmatrix = fopen(matrix_file,"w");
    if (fmatrix == NULL){
        printf("ERROR: Cant open file %s\n",matrix_file);
        exit(1);
    }
    for (i = 0; i < N; i ++){
        for (j = 0; j < N; j ++){
            fprintf(fmatrix,"%d ", E[i][j]);
        }
        fprintf(fmatrix,"\n");
    }
    /* fclose flushes the buffered output, so a failure here means the
       matrix was not completely written */
    if (fclose(fmatrix) != 0){
        printf("ERROR: Cant write file %s\n",matrix_file);
        exit(1);
    }

    for (i = 0; i < N; i ++){
        free(E[i]);
    }
    free(E);
    return 0;
}

5
datasets/smallset

@ -1,5 +0,0 @@
0 0 0 0 0
1 0 1 0 0
1 1 0 1 1
0 0 0 0 1
0 0 0 1 0

1800
datasets/toronto_death_penalty_refined_adj

File diff suppressed because it is too large

BIN
serial_v2/pagerank.out

Binary file not shown.

1
serial_v2/pagerank_output

File diff suppressed because one or more lines are too long

BIN
serial_v2/serial_gs_pagerank.o

Binary file not shown.

BIN
serial_v2/serial_gs_pagerank_functions.o

Binary file not shown.
Loading…
Cancel
Save