Difference between revisions of "Metrics/Data Catalog"
Line 1: | Line 1: | ||
+ | |||
+ | == per license link: search engine queries == | ||
+ | The "simple" table contains the results of link: queries for each distinct license URL dispatched to a number of search engines each day. Yahoo's SiteExplorer gives the most useful results -- Google's are very incomplete, presumably to not give link spammers an in theory useful number. | ||
+ | |||
+ | Here's the table description and equivalent table creation statement, an example insertion, and illustrative unique values: | ||
<pre> | <pre> | ||
mysql> desc simple; | mysql> desc simple; | ||
Line 26: | Line 31: | ||
KEY `timestamp_index` (`timestamp`) | KEY `timestamp_index` (`timestamp`) | ||
); | ); | ||
+ | |||
+ | INSERT INTO `simple` VALUES (750584,'http://creativecommons.org/licenses/by-sa/2.0/jp/','MSN',3955,'2006-07-11 16:04:56','jp','by-sa','2.0'); | ||
+ | |||
+ | mysql> select now(), count(*) from simple; | ||
+ | +---------------------+----------+ | ||
+ | | now() | count(*) | | ||
+ | +---------------------+----------+ | ||
+ | | 2010-03-09 18:21:26 | 2029989 | | ||
+ | +---------------------+----------+ | ||
+ | |||
+ | mysql> select count(distinct(license_uri)) from simple; | ||
+ | +------------------------------+ | ||
+ | | count(distinct(license_uri)) | | ||
+ | +------------------------------+ | ||
+ | | 540 | | ||
+ | +------------------------------+ | ||
+ | |||
+ | mysql> select distinct(search_engine) from simple; | ||
+ | +---------------+ | ||
+ | | search_engine | | ||
+ | +---------------+ | ||
+ | | MSN | | ||
+ | | Google | | ||
+ | | Yahoo | | ||
+ | | All The Web | | ||
+ | +---------------+ | ||
+ | |||
+ | mysql> select min(timestamp), max(timestamp) from simple; | ||
+ | +---------------------+---------------------+ | ||
+ | | min(timestamp) | max(timestamp) | | ||
+ | +---------------------+---------------------+ | ||
+ | | 2003-05-23 23:20:43 | 2010-02-25 00:05:21 | | ||
+ | +---------------------+---------------------+ | ||
+ | |||
+ | mysql> select count(distinct(jurisdiction)) from simple; | ||
+ | +-------------------------------+ | ||
+ | | count(distinct(jurisdiction)) | | ||
+ | +-------------------------------+ | ||
+ | | 55 | | ||
+ | +-------------------------------+ | ||
+ | |||
+ | mysql> select distinct(license_type) from simple; | ||
+ | +--------------+ | ||
+ | | license_type | | ||
+ | +--------------+ | ||
+ | | by-nc | | ||
+ | | by-nc-nd | | ||
+ | | nc | | ||
+ | | nd | | ||
+ | | by-sa | | ||
+ | | by | | ||
+ | | nc-sa | | ||
+ | | nd-nc | | ||
+ | | sa | | ||
+ | | by-nd | | ||
+ | | by-nc-sa | | ||
+ | | sampling | | ||
+ | | sampling+ | | ||
+ | | nc-sampling+ | | ||
+ | | publicdomain | | ||
+ | | GPL | | ||
+ | | LGPL | | ||
+ | | devnations | | ||
+ | | NULL | | ||
+ | | nc-nd | | ||
+ | | zero | | ||
+ | +--------------+ | ||
+ | |||
+ | mysql> select distinct(license_version) from simple; | ||
+ | +-----------------+ | ||
+ | | license_version | | ||
+ | +-----------------+ | ||
+ | | 1.0 | | ||
+ | | 2.0 | | ||
+ | | 2.5 | | ||
+ | | 2.1 | | ||
+ | | | | ||
+ | | NULL | | ||
+ | | 3.0 | | ||
+ | +-----------------+ | ||
+ | |||
+ | |||
+ | mysql> select distinct(license_uri) from simple where license_version is NULL; | ||
+ | +--------------------------------------------------+ | ||
+ | | license_uri | | ||
+ | +--------------------------------------------------+ | ||
+ | | http://creativecommons.org | | ||
+ | | http://www.creativecommons.org | | ||
+ | | http://creativecommons.org/licenses/publicdomain | | ||
+ | +--------------------------------------------------+ | ||
+ | |||
+ | These can be removed for many purposes: | ||
+ | mysql> select distinct(license_uri) from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL'; | ||
+ | +--------------------------------------------------+ | ||
+ | | license_uri | | ||
+ | +--------------------------------------------------+ | ||
+ | | http://creativecommons.org/licenses/GPL/2.0/ | | ||
+ | | http://creativecommons.org/licenses/LGPL/2.1/ | | ||
+ | | http://creativecommons.org | | ||
+ | | http://www.creativecommons.org | | ||
+ | | http://creativecommons.org/licenses/publicdomain | | ||
+ | +--------------------------------------------------+ | ||
+ | |||
+ | mysql> delete from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL'; | ||
+ | Query OK, 22822 rows affected (10.56 sec) | ||
− | |||
</pre> | </pre> | ||
+ | |||
+ | |||
+ | |||
<pre> | <pre> |
Revision as of 02:36, 10 March 2010
per license link: search engine queries
The "simple" table contains the results of link: queries for each distinct license URL dispatched to a number of search engines each day. Yahoo's SiteExplorer gives the most useful results -- Google's are very incomplete, presumably to not give link spammers an in theory useful number.
Here's the table description and equivalent table creation statement, an example insertion, and illustrative unique values:
mysql> desc simple; +-----------------+--------------+------+-----+---------------------+----------------+ | Field | Type | Null | Key | Default | Extra | +-----------------+--------------+------+-----+---------------------+----------------+ | id | int(11) | NO | PRI | NULL | auto_increment | | license_uri | varchar(255) | NO | | | | | search_engine | varchar(255) | NO | | | | | count | int(11) | NO | | 0 | | | timestamp | datetime | NO | MUL | 0000-00-00 00:00:00 | | | jurisdiction | varchar(255) | YES | | NULL | | | license_type | varchar(16) | YES | | NULL | | | license_version | varchar(16) | YES | | NULL | | +-----------------+--------------+------+-----+---------------------+----------------+ CREATE TABLE `simple` ( `id` int(11) NOT NULL auto_increment, `license_uri` varchar(255) NOT NULL default '', `search_engine` varchar(255) NOT NULL default '', `count` int(11) NOT NULL default '0', `timestamp` datetime NOT NULL default '0000-00-00 00:00:00', `jurisdiction` varchar(255) default NULL, `license_type` varchar(16) default NULL, `license_version` varchar(16) default NULL, PRIMARY KEY (`id`), KEY `timestamp_index` (`timestamp`) ); INSERT INTO `simple` VALUES (750584,'http://creativecommons.org/licenses/by-sa/2.0/jp/','MSN',3955,'2006-07-11 16:04:56','jp','by-sa','2.0'); mysql> select now(), count(*) from simple; +---------------------+----------+ | now() | count(*) | +---------------------+----------+ | 2010-03-09 18:21:26 | 2029989 | +---------------------+----------+ mysql> select count(distinct(license_uri)) from simple; +------------------------------+ | count(distinct(license_uri)) | +------------------------------+ | 540 | +------------------------------+ mysql> select distinct(search_engine) from simple; +---------------+ | search_engine | +---------------+ | MSN | | Google | | Yahoo | | All The Web | +---------------+ mysql> select min(timestamp), max(timestamp) from simple; +---------------------+---------------------+ | min(timestamp) | max(timestamp) | +---------------------+---------------------+ | 2003-05-23 23:20:43 | 2010-02-25 00:05:21 | +---------------------+---------------------+ mysql> select count(distinct(jurisdiction)) from simple; +-------------------------------+ | count(distinct(jurisdiction)) | +-------------------------------+ | 55 | +-------------------------------+ mysql> select distinct(license_type) from simple; +--------------+ | license_type | +--------------+ | by-nc | | by-nc-nd | | nc | | nd | | by-sa | | by | | nc-sa | | nd-nc | | sa | | by-nd | | by-nc-sa | | sampling | | sampling+ | | nc-sampling+ | | publicdomain | | GPL | | LGPL | | devnations | | NULL | | nc-nd | | zero | +--------------+ mysql> select distinct(license_version) from simple; +-----------------+ | license_version | +-----------------+ | 1.0 | | 2.0 | | 2.5 | | 2.1 | | | | NULL | | 3.0 | +-----------------+ mysql> select distinct(license_uri) from simple where license_version is NULL; +--------------------------------------------------+ | license_uri | +--------------------------------------------------+ | http://creativecommons.org | | http://www.creativecommons.org | | http://creativecommons.org/licenses/publicdomain | +--------------------------------------------------+ These can be removed for many purposes: mysql> select distinct(license_uri) from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL'; +--------------------------------------------------+ | license_uri | +--------------------------------------------------+ | http://creativecommons.org/licenses/GPL/2.0/ | | http://creativecommons.org/licenses/LGPL/2.1/ | | http://creativecommons.org | | http://www.creativecommons.org | | http://creativecommons.org/licenses/publicdomain | +--------------------------------------------------+ mysql> delete from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL'; Query OK, 22822 rows affected (10.56 sec)
mysql> desc complex; +-------------------+--------------+------+-----+---------------------+----------------+ | Field | Type | Null | Key | Default | Extra | +-------------------+--------------+------+-----+---------------------+----------------+ | id | int(11) | NO | PRI | NULL | auto_increment | | license_specifier | varchar(255) | NO | | | | | search_engine | varchar(255) | NO | | | | | count | int(11) | NO | | 0 | | | query | varchar(255) | YES | | NULL | | | timestamp | datetime | NO | MUL | 0000-00-00 00:00:00 | | | country | varchar(255) | YES | | NULL | | | language | varchar(255) | YES | | NULL | | +-------------------+--------------+------+-----+---------------------+----------------+ CREATE TABLE `complex` ( `id` int(11) NOT NULL auto_increment, `license_specifier` varchar(255) NOT NULL default '', `search_engine` varchar(255) NOT NULL default '', `count` int(11) NOT NULL default '0', `query` varchar(255) default NULL, `timestamp` datetime NOT NULL default '0000-00-00 00:00:00', `country` varchar(255) default NULL, `language` varchar(255) default NULL, PRIMARY KEY (`id`), KEY `complex_timestamp_index` (`timestamp`) ); INSERT INTO `complex` VALUES (32,'cc_any','Yahoo',3,'license','2006-07-11 16:24:42','Norway','portuguese');
mysql> desc site_specific; +----------------+--------------+------+-----+---------+----------------+ | Field | Type | Null | Key | Default | Extra | +----------------+--------------+------+-----+---------+----------------+ | id | int(11) | NO | PRI | NULL | auto_increment | | license_uri | varchar(255) | NO | | NULL | | | site | varchar(255) | NO | | NULL | | | count | int(11) | NO | | NULL | | | utc_time_stamp | datetime | NO | MUL | NULL | | +----------------+--------------+------+-----+---------+----------------+ CREATE TABLE `site_specific` ( `id` int(11) NOT NULL auto_increment, `license_uri` varchar(255) NOT NULL, `site` varchar(255) NOT NULL, `count` int(11) NOT NULL, `utc_time_stamp` datetime NOT NULL, PRIMARY KEY (`id`), UNIQUE KEY `utc_time_stamp` (`utc_time_stamp`,`license_uri`,`site`), KEY `site_specific_timestamp_index` (`utc_time_stamp`) ); INSERT INTO `site_specific` VALUES (1,'http://creativecommons.org/licenses/by-nd/2.0/','http://www.flickr.com/',317345,'2006-03-17 12:00:00');