Metrics/Data Catalog
Revision as of 02:36, 10 March 2010 by Mike Linksvayer (talk | contribs)
per license link: search engine queries
The "simple" table contains the results of link: queries for each distinct license URL dispatched to a number of search engines each day. Yahoo's SiteExplorer gives the most useful results -- Google's are very incomplete, presumably to not give link spammers an in theory useful number.
Here's the table description and equivalent table creation statement, an example insertion, and illustrative unique values:
mysql> desc simple; +-----------------+--------------+------+-----+---------------------+----------------+ | Field | Type | Null | Key | Default | Extra | +-----------------+--------------+------+-----+---------------------+----------------+ | id | int(11) | NO | PRI | NULL | auto_increment | | license_uri | varchar(255) | NO | | | | | search_engine | varchar(255) | NO | | | | | count | int(11) | NO | | 0 | | | timestamp | datetime | NO | MUL | 0000-00-00 00:00:00 | | | jurisdiction | varchar(255) | YES | | NULL | | | license_type | varchar(16) | YES | | NULL | | | license_version | varchar(16) | YES | | NULL | | +-----------------+--------------+------+-----+---------------------+----------------+ CREATE TABLE `simple` ( `id` int(11) NOT NULL auto_increment, `license_uri` varchar(255) NOT NULL default '', `search_engine` varchar(255) NOT NULL default '', `count` int(11) NOT NULL default '0', `timestamp` datetime NOT NULL default '0000-00-00 00:00:00', `jurisdiction` varchar(255) default NULL, `license_type` varchar(16) default NULL, `license_version` varchar(16) default NULL, PRIMARY KEY (`id`), KEY `timestamp_index` (`timestamp`) ); INSERT INTO `simple` VALUES (750584,'http://creativecommons.org/licenses/by-sa/2.0/jp/','MSN',3955,'2006-07-11 16:04:56','jp','by-sa','2.0'); mysql> select now(), count(*) from simple; +---------------------+----------+ | now() | count(*) | +---------------------+----------+ | 2010-03-09 18:21:26 | 2029989 | +---------------------+----------+ mysql> select count(distinct(license_uri)) from simple; +------------------------------+ | count(distinct(license_uri)) | +------------------------------+ | 540 | +------------------------------+ mysql> select distinct(search_engine) from simple; +---------------+ | search_engine | +---------------+ | MSN | | Google | | Yahoo | | All The Web | +---------------+ mysql> select min(timestamp), max(timestamp) from simple; +---------------------+---------------------+ | min(timestamp) | max(timestamp) | +---------------------+---------------------+ | 2003-05-23 23:20:43 | 2010-02-25 00:05:21 | +---------------------+---------------------+ mysql> select count(distinct(jurisdiction)) from simple; +-------------------------------+ | count(distinct(jurisdiction)) | +-------------------------------+ | 55 | +-------------------------------+ mysql> select distinct(license_type) from simple; +--------------+ | license_type | +--------------+ | by-nc | | by-nc-nd | | nc | | nd | | by-sa | | by | | nc-sa | | nd-nc | | sa | | by-nd | | by-nc-sa | | sampling | | sampling+ | | nc-sampling+ | | publicdomain | | GPL | | LGPL | | devnations | | NULL | | nc-nd | | zero | +--------------+ mysql> select distinct(license_version) from simple; +-----------------+ | license_version | +-----------------+ | 1.0 | | 2.0 | | 2.5 | | 2.1 | | | | NULL | | 3.0 | +-----------------+ mysql> select distinct(license_uri) from simple where license_version is NULL; +--------------------------------------------------+ | license_uri | +--------------------------------------------------+ | http://creativecommons.org | | http://www.creativecommons.org | | http://creativecommons.org/licenses/publicdomain | +--------------------------------------------------+ These can be removed for many purposes: mysql> select distinct(license_uri) from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL'; +--------------------------------------------------+ | license_uri | +--------------------------------------------------+ | http://creativecommons.org/licenses/GPL/2.0/ | | http://creativecommons.org/licenses/LGPL/2.1/ | | http://creativecommons.org | | http://www.creativecommons.org | | http://creativecommons.org/licenses/publicdomain | +--------------------------------------------------+ mysql> delete from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL'; Query OK, 22822 rows affected (10.56 sec)
mysql> desc complex; +-------------------+--------------+------+-----+---------------------+----------------+ | Field | Type | Null | Key | Default | Extra | +-------------------+--------------+------+-----+---------------------+----------------+ | id | int(11) | NO | PRI | NULL | auto_increment | | license_specifier | varchar(255) | NO | | | | | search_engine | varchar(255) | NO | | | | | count | int(11) | NO | | 0 | | | query | varchar(255) | YES | | NULL | | | timestamp | datetime | NO | MUL | 0000-00-00 00:00:00 | | | country | varchar(255) | YES | | NULL | | | language | varchar(255) | YES | | NULL | | +-------------------+--------------+------+-----+---------------------+----------------+ CREATE TABLE `complex` ( `id` int(11) NOT NULL auto_increment, `license_specifier` varchar(255) NOT NULL default '', `search_engine` varchar(255) NOT NULL default '', `count` int(11) NOT NULL default '0', `query` varchar(255) default NULL, `timestamp` datetime NOT NULL default '0000-00-00 00:00:00', `country` varchar(255) default NULL, `language` varchar(255) default NULL, PRIMARY KEY (`id`), KEY `complex_timestamp_index` (`timestamp`) ); INSERT INTO `complex` VALUES (32,'cc_any','Yahoo',3,'license','2006-07-11 16:24:42','Norway','portuguese');
mysql> desc site_specific; +----------------+--------------+------+-----+---------+----------------+ | Field | Type | Null | Key | Default | Extra | +----------------+--------------+------+-----+---------+----------------+ | id | int(11) | NO | PRI | NULL | auto_increment | | license_uri | varchar(255) | NO | | NULL | | | site | varchar(255) | NO | | NULL | | | count | int(11) | NO | | NULL | | | utc_time_stamp | datetime | NO | MUL | NULL | | +----------------+--------------+------+-----+---------+----------------+ CREATE TABLE `site_specific` ( `id` int(11) NOT NULL auto_increment, `license_uri` varchar(255) NOT NULL, `site` varchar(255) NOT NULL, `count` int(11) NOT NULL, `utc_time_stamp` datetime NOT NULL, PRIMARY KEY (`id`), UNIQUE KEY `utc_time_stamp` (`utc_time_stamp`,`license_uri`,`site`), KEY `site_specific_timestamp_index` (`utc_time_stamp`) ); INSERT INTO `site_specific` VALUES (1,'http://creativecommons.org/licenses/by-nd/2.0/','http://www.flickr.com/',317345,'2006-03-17 12:00:00');