Difference between revisions of "Metrics/Data Catalog"

From Creative Commons
Jump to: navigation, search
Line 1: Line 1:
 +
 +
== per license link: search engine queries ==
 +
The "simple" table contains the results of link: queries for each distinct license URL dispatched to a number of search engines each day. Yahoo's SiteExplorer gives the most useful results -- Google's are very incomplete, presumably to not give link spammers an in theory useful number.
 +
 +
Here's the table description and equivalent table creation statement, an example insertion, and illustrative unique values:
 
<pre>
 
<pre>
 
mysql> desc simple;
 
mysql> desc simple;
Line 26: Line 31:
 
   KEY `timestamp_index` (`timestamp`)
 
   KEY `timestamp_index` (`timestamp`)
 
);
 
);
 +
 +
INSERT INTO `simple` VALUES (750584,'http://creativecommons.org/licenses/by-sa/2.0/jp/','MSN',3955,'2006-07-11 16:04:56','jp','by-sa','2.0');
 +
 +
mysql> select now(), count(*) from simple;
 +
+---------------------+----------+
 +
| now()              | count(*) |
 +
+---------------------+----------+
 +
| 2010-03-09 18:21:26 |  2029989 |
 +
+---------------------+----------+
 +
 +
mysql> select count(distinct(license_uri)) from simple;
 +
+------------------------------+
 +
| count(distinct(license_uri)) |
 +
+------------------------------+
 +
|                          540 |
 +
+------------------------------+
 +
 +
mysql> select distinct(search_engine) from simple;
 +
+---------------+
 +
| search_engine |
 +
+---------------+
 +
| MSN          |
 +
| Google        |
 +
| Yahoo        |
 +
| All The Web  |
 +
+---------------+
 +
 +
mysql> select min(timestamp), max(timestamp) from simple;
 +
+---------------------+---------------------+
 +
| min(timestamp)      | max(timestamp)      |
 +
+---------------------+---------------------+
 +
| 2003-05-23 23:20:43 | 2010-02-25 00:05:21 |
 +
+---------------------+---------------------+
 +
 +
mysql> select count(distinct(jurisdiction)) from simple;
 +
+-------------------------------+
 +
| count(distinct(jurisdiction)) |
 +
+-------------------------------+
 +
|                            55 |
 +
+-------------------------------+
 +
 +
mysql> select distinct(license_type) from simple;
 +
+--------------+
 +
| license_type |
 +
+--------------+
 +
| by-nc        |
 +
| by-nc-nd    |
 +
| nc          |
 +
| nd          |
 +
| by-sa        |
 +
| by          |
 +
| nc-sa        |
 +
| nd-nc        |
 +
| sa          |
 +
| by-nd        |
 +
| by-nc-sa    |
 +
| sampling    |
 +
| sampling+    |
 +
| nc-sampling+ |
 +
| publicdomain |
 +
| GPL          |
 +
| LGPL        |
 +
| devnations  |
 +
| NULL        |
 +
| nc-nd        |
 +
| zero        |
 +
+--------------+
 +
 +
mysql> select distinct(license_version) from simple;
 +
+-----------------+
 +
| license_version |
 +
+-----------------+
 +
| 1.0            |
 +
| 2.0            |
 +
| 2.5            |
 +
| 2.1            |
 +
|                |
 +
| NULL            |
 +
| 3.0            |
 +
+-----------------+
 +
 +
 +
mysql> select distinct(license_uri) from simple where license_version is NULL;
 +
+--------------------------------------------------+
 +
| license_uri                                      |
 +
+--------------------------------------------------+
 +
| http://creativecommons.org                      |
 +
| http://www.creativecommons.org                  |
 +
| http://creativecommons.org/licenses/publicdomain |
 +
+--------------------------------------------------+
 +
 +
These can be removed for many purposes:
 +
mysql> select distinct(license_uri) from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL';
 +
+--------------------------------------------------+
 +
| license_uri                                      |
 +
+--------------------------------------------------+
 +
| http://creativecommons.org/licenses/GPL/2.0/    |
 +
| http://creativecommons.org/licenses/LGPL/2.1/    |
 +
| http://creativecommons.org                      |
 +
| http://www.creativecommons.org                  |
 +
| http://creativecommons.org/licenses/publicdomain |
 +
+--------------------------------------------------+
 +
 +
mysql> delete from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL';
 +
Query OK, 22822 rows affected (10.56 sec)
  
  
INSERT INTO `simple` VALUES (750584,'http://creativecommons.org/licenses/by-sa/2.0/jp/','MSN',3955,'2006-07-11 16:04:56','jp','by-sa','2.0');
 
 
</pre>
 
</pre>
 +
 +
 +
  
 
<pre>
 
<pre>

Revision as of 02:36, 10 March 2010

per license link: search engine queries

The "simple" table contains the results of link: queries for each distinct license URL dispatched to a number of search engines each day. Yahoo's SiteExplorer gives the most useful results -- Google's are very incomplete, presumably to not give link spammers an in theory useful number.

Here's the table description and equivalent table creation statement, an example insertion, and illustrative unique values:

mysql> desc simple;
+-----------------+--------------+------+-----+---------------------+----------------+
| Field           | Type         | Null | Key | Default             | Extra          |
+-----------------+--------------+------+-----+---------------------+----------------+
| id              | int(11)      | NO   | PRI | NULL                | auto_increment |
| license_uri     | varchar(255) | NO   |     |                     |                |
| search_engine   | varchar(255) | NO   |     |                     |                |
| count           | int(11)      | NO   |     | 0                   |                |
| timestamp       | datetime     | NO   | MUL | 0000-00-00 00:00:00 |                |
| jurisdiction    | varchar(255) | YES  |     | NULL                |                |
| license_type    | varchar(16)  | YES  |     | NULL                |                |
| license_version | varchar(16)  | YES  |     | NULL                |                |
+-----------------+--------------+------+-----+---------------------+----------------+

CREATE TABLE `simple` (
  `id` int(11) NOT NULL auto_increment,
  `license_uri` varchar(255) NOT NULL default '',
  `search_engine` varchar(255) NOT NULL default '',
  `count` int(11) NOT NULL default '0',
  `timestamp` datetime NOT NULL default '0000-00-00 00:00:00',
  `jurisdiction` varchar(255) default NULL,
  `license_type` varchar(16) default NULL,
  `license_version` varchar(16) default NULL,
  PRIMARY KEY  (`id`),
  KEY `timestamp_index` (`timestamp`)
);

INSERT INTO `simple` VALUES (750584,'http://creativecommons.org/licenses/by-sa/2.0/jp/','MSN',3955,'2006-07-11 16:04:56','jp','by-sa','2.0');

mysql> select now(), count(*) from simple;
+---------------------+----------+
| now()               | count(*) |
+---------------------+----------+
| 2010-03-09 18:21:26 |  2029989 |
+---------------------+----------+

mysql> select count(distinct(license_uri)) from simple;
+------------------------------+
| count(distinct(license_uri)) |
+------------------------------+
|                          540 |
+------------------------------+

mysql> select distinct(search_engine) from simple;
+---------------+
| search_engine |
+---------------+
| MSN           |
| Google        |
| Yahoo         |
| All The Web   |
+---------------+

mysql> select min(timestamp), max(timestamp) from simple;
+---------------------+---------------------+
| min(timestamp)      | max(timestamp)      |
+---------------------+---------------------+
| 2003-05-23 23:20:43 | 2010-02-25 00:05:21 |
+---------------------+---------------------+

mysql> select count(distinct(jurisdiction)) from simple;
+-------------------------------+
| count(distinct(jurisdiction)) |
+-------------------------------+
|                            55 |
+-------------------------------+

mysql> select distinct(license_type) from simple;
+--------------+
| license_type |
+--------------+
| by-nc        |
| by-nc-nd     |
| nc           |
| nd           |
| by-sa        |
| by           |
| nc-sa        |
| nd-nc        |
| sa           |
| by-nd        |
| by-nc-sa     |
| sampling     |
| sampling+    |
| nc-sampling+ |
| publicdomain |
| GPL          |
| LGPL         |
| devnations   |
| NULL         |
| nc-nd        |
| zero         |
+--------------+

mysql> select distinct(license_version) from simple;
+-----------------+
| license_version |
+-----------------+
| 1.0             |
| 2.0             |
| 2.5             |
| 2.1             |
|                 |
| NULL            |
| 3.0             |
+-----------------+


mysql> select distinct(license_uri) from simple where license_version is NULL;
+--------------------------------------------------+
| license_uri                                      |
+--------------------------------------------------+
| http://creativecommons.org                       |
| http://www.creativecommons.org                   |
| http://creativecommons.org/licenses/publicdomain |
+--------------------------------------------------+

These can be removed for many purposes:
mysql> select distinct(license_uri) from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL';
+--------------------------------------------------+
| license_uri                                      |
+--------------------------------------------------+
| http://creativecommons.org/licenses/GPL/2.0/     |
| http://creativecommons.org/licenses/LGPL/2.1/    |
| http://creativecommons.org                       |
| http://www.creativecommons.org                   |
| http://creativecommons.org/licenses/publicdomain |
+--------------------------------------------------+

mysql> delete from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL';
Query OK, 22822 rows affected (10.56 sec)




mysql> desc complex;
+-------------------+--------------+------+-----+---------------------+----------------+
| Field             | Type         | Null | Key | Default             | Extra          |
+-------------------+--------------+------+-----+---------------------+----------------+
| id                | int(11)      | NO   | PRI | NULL                | auto_increment |
| license_specifier | varchar(255) | NO   |     |                     |                |
| search_engine     | varchar(255) | NO   |     |                     |                |
| count             | int(11)      | NO   |     | 0                   |                |
| query             | varchar(255) | YES  |     | NULL                |                |
| timestamp         | datetime     | NO   | MUL | 0000-00-00 00:00:00 |                |
| country           | varchar(255) | YES  |     | NULL                |                |
| language          | varchar(255) | YES  |     | NULL                |                |
+-------------------+--------------+------+-----+---------------------+----------------+

CREATE TABLE `complex` (
  `id` int(11) NOT NULL auto_increment,
  `license_specifier` varchar(255) NOT NULL default '',
  `search_engine` varchar(255) NOT NULL default '',
  `count` int(11) NOT NULL default '0',
  `query` varchar(255) default NULL,
  `timestamp` datetime NOT NULL default '0000-00-00 00:00:00',
  `country` varchar(255) default NULL,
  `language` varchar(255) default NULL,
  PRIMARY KEY  (`id`),
  KEY `complex_timestamp_index` (`timestamp`)
);

INSERT INTO `complex` VALUES (32,'cc_any','Yahoo',3,'license','2006-07-11 16:24:42','Norway','portuguese');

mysql> desc site_specific;
+----------------+--------------+------+-----+---------+----------------+
| Field          | Type         | Null | Key | Default | Extra          |
+----------------+--------------+------+-----+---------+----------------+
| id             | int(11)      | NO   | PRI | NULL    | auto_increment |
| license_uri    | varchar(255) | NO   |     | NULL    |                |
| site           | varchar(255) | NO   |     | NULL    |                |
| count          | int(11)      | NO   |     | NULL    |                |
| utc_time_stamp | datetime     | NO   | MUL | NULL    |                |
+----------------+--------------+------+-----+---------+----------------+

CREATE TABLE `site_specific` (
  `id` int(11) NOT NULL auto_increment,
  `license_uri` varchar(255) NOT NULL,
  `site` varchar(255) NOT NULL,
  `count` int(11) NOT NULL,
  `utc_time_stamp` datetime NOT NULL,
  PRIMARY KEY  (`id`),
  UNIQUE KEY `utc_time_stamp` (`utc_time_stamp`,`license_uri`,`site`),
  KEY `site_specific_timestamp_index` (`utc_time_stamp`)
);

INSERT INTO `site_specific` VALUES (1,'http://creativecommons.org/licenses/by-nd/2.0/','http://www.flickr.com/',317345,'2006-03-17 12:00:00');