Apostolos Fanakis
6 years ago
21 changed files with 124 additions and 164006 deletions
@ -0,0 +1,23 @@ |
|||||
|
The datasets in this folder were downloaded from the website of the Stanford Network Analysis Project (SNAP), found [here](https://snap.stanford.edu/data/). |
||||
|
|
||||
|
More details about the datasets can be found in the table below. |
||||
|
|
||||
|
| Dataset directory | Description | Nodes | Edges | URL link | |
||||
|
| ----------- | ----------- | ----------- | ----------- | ----------- | |
||||
|
| web-Google | "web-Google" | 875,713 | 5,105,039 | [link](https://snap.stanford.edu/data/web-Google.html) | |
||||
|
| wiki-Talk | "wiki-Talk" | 2,394,385 | 5,021,410 | [link](https://snap.stanford.edu/data/wiki-Talk.html) | |
||||
|
|
||||
|
### Adjustments made to the datasets: |
||||
|
The datasets had four (4) lines of meta-data at the beginning of the files and the data were saved in a form that had one edge per line following the pattern "`linkFrom\tlinkTo\n`", like so: |
||||
|
``` |
||||
|
linkFrom linkTo |
||||
|
linkFrom linkTo |
||||
|
... |
||||
|
``` |
||||
|
A program in C was written to discard the meta-data lines and transform the pattern into a new one that has all the out-links of a page in a single row, like so: |
||||
|
``` |
||||
|
page_1: linkTo_1 linkTo_2 linkTo_3 ... |
||||
|
page_2: linkTo_1 ... |
||||
|
``` |
||||
|
|
||||
|
The program is provided in this repository, under the pathname: `/datasets/Stanford Large Network Dataset Collection/graphToAdjacencyList.c`. |
@ -0,0 +1,90 @@ |
|||||
|
#include <stdio.h> |
||||
|
#include <stdlib.h> |
||||
|
#include <string.h> |
||||
|
|
||||
|
/*
 * One directed edge of the graph as read from the input file:
 * `from` holds the source node id and `to` holds the destination node id.
 */
struct pair {
    int from;
    int to;
};

typedef struct pair Pair;
||||
|
|
||||
|
int main(int argc, char const *argv[]) { |
||||
|
if (argc != 2) { |
||||
|
printf("Usage: ./graphToAdjacencyList <graphFile>\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
|
||||
|
FILE *graphFile = fopen(argv[1], "r"); |
||||
|
if (!graphFile) { |
||||
|
printf("Error opening file \n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
|
||||
|
char buffer[512]; |
||||
|
// Skips the first two lines
|
||||
|
fgets(buffer, 512, graphFile); |
||||
|
fgets(buffer, 512, graphFile); |
||||
|
|
||||
|
// Third line has the numbers of nodes and edges, we need to parse them.
|
||||
|
int numberOfNodes = 0, numberOfEdges; |
||||
|
fgets(buffer, 512, graphFile); |
||||
|
|
||||
|
char *token = strtok(buffer, " "); |
||||
|
int getNodes = 0, getEdges = 0; |
||||
|
|
||||
|
while (token != NULL) { |
||||
|
if (strcmp(token, "Nodes:") == 0) { |
||||
|
getNodes = 1; |
||||
|
} else if (getNodes == 1) { |
||||
|
numberOfNodes = atoi(token); |
||||
|
getNodes = 0; |
||||
|
} else if (strcmp(token, "Edges:") == 0) { |
||||
|
getEdges = 1; |
||||
|
} else if (getEdges == 1) { |
||||
|
numberOfEdges = atoi(token); |
||||
|
break; |
||||
|
} |
||||
|
token = strtok (NULL, " ,.-"); |
||||
|
} |
||||
|
|
||||
|
// Skips the fourth line
|
||||
|
fgets(buffer, 512, graphFile); |
||||
|
|
||||
|
Pair **edges = (Pair **) malloc(numberOfEdges * sizeof(Pair *)); |
||||
|
printf("Reading edges from file...\n"); |
||||
|
for (int i=0; i<numberOfEdges; i++) { |
||||
|
edges[i] = (Pair *) malloc(sizeof(Pair)); |
||||
|
|
||||
|
int f_from = 0, f_to = 0; |
||||
|
if (!fscanf(graphFile, "%d %d", &f_from, &f_to)) { |
||||
|
break; |
||||
|
} |
||||
|
|
||||
|
edges[i]->from = f_from; |
||||
|
edges[i]->to = f_to; |
||||
|
} |
||||
|
|
||||
|
|
||||
|
FILE *adjacentListFile = fopen("adjacentListCreated", "w"); |
||||
|
int index = 0; |
||||
|
|
||||
|
printf("\nWriting nodes to file...\n"); |
||||
|
for (int i=0; i<numberOfNodes; ++i) { |
||||
|
int hasOutlinks = 0; |
||||
|
|
||||
|
fprintf(adjacentListFile, "%d: ", i); |
||||
|
|
||||
|
while (index < numberOfEdges && edges[index]->from == i) { |
||||
|
fprintf(adjacentListFile, "%d ", edges[index]->to); |
||||
|
if (!hasOutlinks) hasOutlinks = 1; |
||||
|
++index; |
||||
|
} |
||||
|
|
||||
|
if (!hasOutlinks) { |
||||
|
fprintf(adjacentListFile, "-1 "); |
||||
|
} |
||||
|
|
||||
|
fprintf(adjacentListFile, "\n"); |
||||
|
} |
||||
|
|
||||
|
return 0; |
||||
|
} |
Binary file not shown.
Binary file not shown.
@ -0,0 +1,10 @@ |
|||||
|
The datasets in this folder were downloaded from the website of the computer science course at University of Toronto, found [here](http://www.cs.toronto.edu/~tsap/experiments/download/download.html). |
||||
|
|
||||
|
More details about the datasets can be found in the table below. |
||||
|
|
||||
|
| Dataset directory | Description | URL link | |
||||
|
| ----------- | ----------- | ----------- | |
||||
|
| \_architecture | "architecture" | [link](http://www.cs.toronto.edu/~tsap/experiments/download/_architecture.tar.Z) | |
||||
|
| \_blues | "blues" | [link](http://www.cs.toronto.edu/~tsap/experiments/download/_blues.tar.Z) | |
||||
|
| \_search_engines | "search_engines" | [link](http://www.cs.toronto.edu/~tsap/experiments/download/_search_engines.tar.Z) | |
||||
|
| \_weather | "weather" | [link](http://www.cs.toronto.edu/~tsap/experiments/download/_weather.tar.Z) | |
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -1,80 +0,0 @@ |
|||||
#include <stdio.h> |
|
||||
#include <stdlib.h> |
|
||||
#include <string.h> |
|
||||
|
|
||||
|
|
||||
/*
 * Writes "<dir>/graph/<leaf>" into dst, which has room for cap bytes.
 * Returns 1 on success, 0 when formatting failed or the result would not fit.
 */
static int buildGraphPath(char *dst, size_t cap, const char *dir, const char *leaf) {
    int written = snprintf(dst, cap, "%s/graph/%s", dir, leaf);
    return written >= 0 && (size_t) written < cap;
}

/*
 * Builds the adjacency matrix of a query graph from its adjacency list.
 *
 * Usage: list2matrix query_directory
 *
 * Reads the number of nodes N from "<query_directory>/graph/nodes", then reads
 * N rows of the form "<id>: <j> <j> ... -1" from
 * "<query_directory>/graph/adj_list", and writes the N x N 0/1 matrix to
 * "<query_directory>/graph/adj_matrix" (one row per line, entries separated by
 * a space).
 *
 * Returns 0 on success; exits with status 1 on any path, I/O, parsing or
 * allocation error (open streams and heap memory are then released by process
 * termination).
 */
int main(int argc, char **argv){
    FILE *fnodes;
    char nodes_file[1000];
    FILE *flist;
    char list_file[1000];
    FILE *fmatrix;
    char matrix_file[1000];
    int i,j;
    int **E;
    int N; // number of nodes

    /***
      The argument for the program is the directory name
      of the query for which we want to create the adjacency matrix
    ***/

    if (argc != 2){
        fprintf(stderr,"list2matrix query_directory\n");
        exit(1);
    }

    // Build all three paths up front so an over-long directory name is
    // rejected instead of overflowing the fixed-size buffers.
    if (!buildGraphPath(nodes_file, sizeof nodes_file, argv[1], "nodes") ||
        !buildGraphPath(list_file, sizeof list_file, argv[1], "adj_list") ||
        !buildGraphPath(matrix_file, sizeof matrix_file, argv[1], "adj_matrix")){
        fprintf(stderr,"ERROR: Path too long: %s\n",argv[1]);
        exit(1);
    }

    /*** open the nodes file to obtain the number of nodes ***/

    fnodes = fopen(nodes_file,"r");
    if (fnodes == NULL){
        fprintf(stderr,"ERROR: Cant open file %s\n",nodes_file);
        exit(1);
    }
    if (fscanf(fnodes,"%d",&N) != 1 || N < 0){
        fprintf(stderr,"ERROR: Cant read the number of nodes from %s\n",nodes_file);
        exit(1);
    }
    fclose(fnodes);

    /**** Read List and Construct the adjacency matrix E ****/

    // calloc zero-fills every row and guards the size multiplication.
    E = calloc((size_t) N, sizeof *E);
    if (E == NULL && N > 0){
        fprintf(stderr,"ERROR: Out of memory for %d nodes\n",N);
        exit(1);
    }
    for (i = 0; i < N; i ++){
        E[i] = calloc((size_t) N, sizeof *E[i]);
        if (E[i] == NULL){
            fprintf(stderr,"ERROR: Out of memory for %d nodes\n",N);
            exit(1);
        }
    }

    flist = fopen(list_file,"r");
    if (flist == NULL){
        fprintf(stderr,"ERROR: Cant open file %s\n",list_file);
        exit(1);
    }
    for (i = 0; i < N; i ++){
        // "%*d:" skips the row's own node id; only the first neighbour is
        // assigned, so a successful read returns 1.
        if (fscanf(flist,"%*d: %d",&j) != 1){
            fprintf(stderr,"ERROR: Malformed adjacency list in %s (row %d)\n",list_file,i);
            exit(1);
        }
        // Each row is terminated by -1.
        while (j != -1){
            if (j < 0 || j >= N){
                fprintf(stderr,"ERROR: Node index %d out of range in %s (row %d)\n",j,list_file,i);
                exit(1);
            }
            E[i][j] = 1;
            // Without this check a truncated row would leave j unchanged
            // and the loop would never end.
            if (fscanf(flist,"%d",&j) != 1){
                fprintf(stderr,"ERROR: Malformed adjacency list in %s (row %d)\n",list_file,i);
                exit(1);
            }
        }
    }

    fclose(flist);

    /*** print the adjacency matrix ***/

    fmatrix = fopen(matrix_file,"w");
    if (fmatrix == NULL){
        fprintf(stderr,"ERROR: Cant open file %s\n",matrix_file);
        exit(1);
    }
    for (i = 0; i < N; i ++){
        for (j = 0; j < N; j ++){
            fprintf(fmatrix,"%d ", E[i][j]);
        }
        fprintf(fmatrix,"\n");
    }
    // fclose flushes buffered output, so a failure here means lost data.
    if (fclose(fmatrix) != 0){
        fprintf(stderr,"ERROR: Cant write file %s\n",matrix_file);
        exit(1);
    }

    for (i = 0; i < N; i ++){
        free(E[i]);
    }
    free(E);

    return 0;
}
|
||||
|
|
@ -1,5 +0,0 @@ |
|||||
0 0 0 0 0 |
|
||||
1 0 1 0 0 |
|
||||
1 1 0 1 1 |
|
||||
0 0 0 0 1 |
|
||||
0 0 0 1 0 |
|
File diff suppressed because it is too large
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Loading…
Reference in new issue