Difference between revisions of "Metrics/Data Catalog"

From Creative Commons
Jump to: navigation, search
(per license link: search engine queries)
Line 111: Line 111:
 
| 3.0            |
 
| 3.0            |
 
+-----------------+
 
+-----------------+
 
  
 
mysql> select distinct(license_uri) from simple where license_version is NULL;
 
mysql> select distinct(license_uri) from simple where license_version is NULL;
Line 121: Line 120:
 
| http://creativecommons.org/licenses/publicdomain |
 
| http://creativecommons.org/licenses/publicdomain |
 
+--------------------------------------------------+
 
+--------------------------------------------------+
 +
</pre>
  
 
These can be removed for many purposes:
 
These can be removed for many purposes:
 +
<pre>
 
mysql> select distinct(license_uri) from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL';
 
mysql> select distinct(license_uri) from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL';
 
+--------------------------------------------------+
 
+--------------------------------------------------+
Line 136: Line 137:
 
mysql> delete from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL';
 
mysql> delete from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL';
 
Query OK, 22822 rows affected (10.56 sec)
 
Query OK, 22822 rows affected (10.56 sec)
 
 
 
</pre>
 
</pre>
  
  
  
 
+
==License property search engine API queries==
 +
Yahoo and Google allow filtering searches by CC license property (eg allows commercial use). This is coarser than per license link: queries, but unlike link: can be (indeed must be) combined with keyword and optionally country and language operators. This facilitates another take on the distribution of CC licenses not dependent on availability or use of jurisdiction ported licenses. Currently only Yahoo is regularly queried. Countries and languages are those supported for Yahoo queries.
 
<pre>
 
<pre>
 
mysql> desc complex;
 
mysql> desc complex;
Line 173: Line 173:
 
INSERT INTO `complex` VALUES (32,'cc_any','Yahoo',3,'license','2006-07-11 16:24:42','Norway','portuguese');
 
INSERT INTO `complex` VALUES (32,'cc_any','Yahoo',3,'license','2006-07-11 16:24:42','Norway','portuguese');
  
 +
mysql> select count(*), license_specifier, search_engine from complex group by license_specifier, search_engine;
 +
+----------+---------------------------------------------------------------------------+---------------+
 +
| count(*) | license_specifier                                                        | search_engine |
 +
+----------+---------------------------------------------------------------------------+---------------+
 +
|  199255 |                                                                          | All The Web  |
 +
|  160438 |                                                                          | Yahoo        |
 +
|      960 | &ccs=c                                                                    | Yahoo        |
 +
|      960 | &ccs=c&ccs=e                                                              | Yahoo        |
 +
|      960 | &ccs=e                                                                    | Yahoo        |
 +
|  4619917 | cc_any                                                                    | Yahoo        |
 +
|      898 | cc_attribute                                                              | Google        |
 +
|  4483821 | cc_commercial                                                            | Yahoo        |
 +
|  4318628 | cc_commercial&cc_modifiable                                              | Yahoo        |
 +
|  4387633 | cc_modifiable                                                            | Yahoo        |
 +
|      900 | cc_noncommercial                                                          | Google        |
 +
|      901 | cc_nonderived                                                            | Google        |
 +
|      900 | cc_publicdomain                                                          | Google        |
 +
|      912 | cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived | Google        |
 +
|      914 | cc_sharealike                                                            | Google        |
 +
+----------+---------------------------------------------------------------------------+---------------+
 +
 +
mysql> select distinct query from complex;
 +
+---------------------+
 +
| query              |
 +
+---------------------+
 +
| license            |
 +
| -license            |
 +
| work                |
 +
| -work              |
 +
| html                |
 +
| -html              |
 +
| license OR -license |
 +
| work OR -work      |
 +
| html OR -html      |
 +
+---------------------+
 +
9 rows in set (25.23 sec)
 +
 +
mysql> select min(timestamp),max(timestamp) from complex;
 +
+---------------------+---------------------+
 +
| min(timestamp)      | max(timestamp)      |
 +
+---------------------+---------------------+
 +
| 2005-06-21 00:00:00 | 2010-02-25 00:05:21 |
 +
+---------------------+---------------------+
 +
 +
mysql> select count(distinct(country)) from complex;
 +
+--------------------------+
 +
| count(distinct(country)) |
 +
+--------------------------+
 +
|                      24 |
 +
+--------------------------+
 +
 +
mysql> select count(distinct(language)) from complex;
 +
+---------------------------+
 +
| count(distinct(language)) |
 +
+---------------------------+
 +
|                        31 |
 +
+---------------------------+
 
</pre>
 
</pre>
  
 +
 +
==Content repository queries==
 +
Obtain number of licensed works from individual repositories (currently only Flickr).
 
<pre>
 
<pre>
 
mysql> desc site_specific;
 
mysql> desc site_specific;
Line 199: Line 259:
  
 
INSERT INTO `site_specific` VALUES (1,'http://creativecommons.org/licenses/by-nd/2.0/','http://www.flickr.com/',317345,'2006-03-17 12:00:00');
 
INSERT INTO `site_specific` VALUES (1,'http://creativecommons.org/licenses/by-nd/2.0/','http://www.flickr.com/',317345,'2006-03-17 12:00:00');
 +
 +
mysql> select distinct(license_uri) from site_specific;
 +
+---------------------------------------------------+
 +
| license_uri                                      |
 +
+---------------------------------------------------+
 +
| http://creativecommons.org/licenses/by-nc-nd/2.0/ |
 +
| http://creativecommons.org/licenses/by-nc-sa/2.0/ |
 +
| http://creativecommons.org/licenses/by-nc/2.0/    |
 +
| http://creativecommons.org/licenses/by-nd/2.0/    |
 +
| http://creativecommons.org/licenses/by-sa/2.0/    |
 +
| http://creativecommons.org/licenses/by/2.0/      |
 +
+---------------------------------------------------+
 +
 +
mysql> select min(utc_time_stamp),max(utc_time_stamp) from site_specific;
 +
+---------------------+---------------------+
 +
| min(utc_time_stamp) | max(utc_time_stamp) |
 +
+---------------------+---------------------+
 +
| 2006-03-17 12:00:00 | 2010-02-25 08:05:01 |
 +
+---------------------+---------------------+
 +
 
</pre>
 
</pre>

Revision as of 03:10, 10 March 2010

per license link: search engine queries

The "simple" table contains the results of link: queries for each distinct license URL dispatched to a number of search engines each day. Yahoo's SiteExplorer gives the most useful results -- Google's are very incomplete, presumably to not give link spammers an in theory useful number.

Here's the table description and equivalent table creation statement, an example insertion, and illustrative unique values:

mysql> desc simple;
+-----------------+--------------+------+-----+---------------------+----------------+
| Field           | Type         | Null | Key | Default             | Extra          |
+-----------------+--------------+------+-----+---------------------+----------------+
| id              | int(11)      | NO   | PRI | NULL                | auto_increment |
| license_uri     | varchar(255) | NO   |     |                     |                |
| search_engine   | varchar(255) | NO   |     |                     |                |
| count           | int(11)      | NO   |     | 0                   |                |
| timestamp       | datetime     | NO   | MUL | 0000-00-00 00:00:00 |                |
| jurisdiction    | varchar(255) | YES  |     | NULL                |                |
| license_type    | varchar(16)  | YES  |     | NULL                |                |
| license_version | varchar(16)  | YES  |     | NULL                |                |
+-----------------+--------------+------+-----+---------------------+----------------+

CREATE TABLE `simple` (
  `id` int(11) NOT NULL auto_increment,
  `license_uri` varchar(255) NOT NULL default '',
  `search_engine` varchar(255) NOT NULL default '',
  `count` int(11) NOT NULL default '0',
  `timestamp` datetime NOT NULL default '0000-00-00 00:00:00',
  `jurisdiction` varchar(255) default NULL,
  `license_type` varchar(16) default NULL,
  `license_version` varchar(16) default NULL,
  PRIMARY KEY  (`id`),
  KEY `timestamp_index` (`timestamp`)
);

INSERT INTO `simple` VALUES (750584,'http://creativecommons.org/licenses/by-sa/2.0/jp/','MSN',3955,'2006-07-11 16:04:56','jp','by-sa','2.0');

mysql> select now(), count(*) from simple;
+---------------------+----------+
| now()               | count(*) |
+---------------------+----------+
| 2010-03-09 18:21:26 |  2029989 |
+---------------------+----------+

mysql> select count(distinct(license_uri)) from simple;
+------------------------------+
| count(distinct(license_uri)) |
+------------------------------+
|                          540 |
+------------------------------+

mysql> select distinct(search_engine) from simple;
+---------------+
| search_engine |
+---------------+
| MSN           |
| Google        |
| Yahoo         |
| All The Web   |
+---------------+

mysql> select min(timestamp), max(timestamp) from simple;
+---------------------+---------------------+
| min(timestamp)      | max(timestamp)      |
+---------------------+---------------------+
| 2003-05-23 23:20:43 | 2010-02-25 00:05:21 |
+---------------------+---------------------+

mysql> select count(distinct(jurisdiction)) from simple;
+-------------------------------+
| count(distinct(jurisdiction)) |
+-------------------------------+
|                            55 |
+-------------------------------+

mysql> select distinct(license_type) from simple;
+--------------+
| license_type |
+--------------+
| by-nc        |
| by-nc-nd     |
| nc           |
| nd           |
| by-sa        |
| by           |
| nc-sa        |
| nd-nc        |
| sa           |
| by-nd        |
| by-nc-sa     |
| sampling     |
| sampling+    |
| nc-sampling+ |
| publicdomain |
| GPL          |
| LGPL         |
| devnations   |
| NULL         |
| nc-nd        |
| zero         |
+--------------+

mysql> select distinct(license_version) from simple;
+-----------------+
| license_version |
+-----------------+
| 1.0             |
| 2.0             |
| 2.5             |
| 2.1             |
|                 |
| NULL            |
| 3.0             |
+-----------------+

mysql> select distinct(license_uri) from simple where license_version is NULL;
+--------------------------------------------------+
| license_uri                                      |
+--------------------------------------------------+
| http://creativecommons.org                       |
| http://www.creativecommons.org                   |
| http://creativecommons.org/licenses/publicdomain |
+--------------------------------------------------+

These can be removed for many purposes:

mysql> select distinct(license_uri) from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL';
+--------------------------------------------------+
| license_uri                                      |
+--------------------------------------------------+
| http://creativecommons.org/licenses/GPL/2.0/     |
| http://creativecommons.org/licenses/LGPL/2.1/    |
| http://creativecommons.org                       |
| http://www.creativecommons.org                   |
| http://creativecommons.org/licenses/publicdomain |
+--------------------------------------------------+

mysql> delete from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL';
Query OK, 22822 rows affected (10.56 sec)


License property search engine API queries

Yahoo and Google allow filtering searches by CC license property (eg allows commercial use). This is coarser than per license link: queries, but unlike link: can be (indeed must be) combined with keyword and optionally country and language operators. This facilitates another take on the distribution of CC licenses not dependent on availability or use of jurisdiction ported licenses. Currently only Yahoo is regularly queried. Countries and languages are those supported for Yahoo queries.

mysql> desc complex;
+-------------------+--------------+------+-----+---------------------+----------------+
| Field             | Type         | Null | Key | Default             | Extra          |
+-------------------+--------------+------+-----+---------------------+----------------+
| id                | int(11)      | NO   | PRI | NULL                | auto_increment |
| license_specifier | varchar(255) | NO   |     |                     |                |
| search_engine     | varchar(255) | NO   |     |                     |                |
| count             | int(11)      | NO   |     | 0                   |                |
| query             | varchar(255) | YES  |     | NULL                |                |
| timestamp         | datetime     | NO   | MUL | 0000-00-00 00:00:00 |                |
| country           | varchar(255) | YES  |     | NULL                |                |
| language          | varchar(255) | YES  |     | NULL                |                |
+-------------------+--------------+------+-----+---------------------+----------------+

CREATE TABLE `complex` (
  `id` int(11) NOT NULL auto_increment,
  `license_specifier` varchar(255) NOT NULL default '',
  `search_engine` varchar(255) NOT NULL default '',
  `count` int(11) NOT NULL default '0',
  `query` varchar(255) default NULL,
  `timestamp` datetime NOT NULL default '0000-00-00 00:00:00',
  `country` varchar(255) default NULL,
  `language` varchar(255) default NULL,
  PRIMARY KEY  (`id`),
  KEY `complex_timestamp_index` (`timestamp`)
);

INSERT INTO `complex` VALUES (32,'cc_any','Yahoo',3,'license','2006-07-11 16:24:42','Norway','portuguese');

mysql> select count(*), license_specifier, search_engine from complex group by license_specifier, search_engine;
+----------+---------------------------------------------------------------------------+---------------+
| count(*) | license_specifier                                                         | search_engine |
+----------+---------------------------------------------------------------------------+---------------+
|   199255 |                                                                           | All The Web   |
|   160438 |                                                                           | Yahoo         |
|      960 | &ccs=c                                                                    | Yahoo         |
|      960 | &ccs=c&ccs=e                                                              | Yahoo         |
|      960 | &ccs=e                                                                    | Yahoo         |
|  4619917 | cc_any                                                                    | Yahoo         |
|      898 | cc_attribute                                                              | Google        |
|  4483821 | cc_commercial                                                             | Yahoo         |
|  4318628 | cc_commercial&cc_modifiable                                               | Yahoo         |
|  4387633 | cc_modifiable                                                             | Yahoo         |
|      900 | cc_noncommercial                                                          | Google        |
|      901 | cc_nonderived                                                             | Google        |
|      900 | cc_publicdomain                                                           | Google        |
|      912 | cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived | Google        |
|      914 | cc_sharealike                                                             | Google        |
+----------+---------------------------------------------------------------------------+---------------+

mysql> select distinct query from complex;
+---------------------+
| query               |
+---------------------+
| license             |
| -license            |
| work                |
| -work               |
| html                |
| -html               |
| license OR -license |
| work OR -work       |
| html OR -html       |
+---------------------+
9 rows in set (25.23 sec)

mysql> select min(timestamp),max(timestamp) from complex;
+---------------------+---------------------+
| min(timestamp)      | max(timestamp)      |
+---------------------+---------------------+
| 2005-06-21 00:00:00 | 2010-02-25 00:05:21 |
+---------------------+---------------------+

mysql> select count(distinct(country)) from complex;
+--------------------------+
| count(distinct(country)) |
+--------------------------+
|                       24 |
+--------------------------+

mysql> select count(distinct(language)) from complex;
+---------------------------+
| count(distinct(language)) |
+---------------------------+
|                        31 |
+---------------------------+


Content repository queries

Obtain number of licensed works from individual repositories (currently only Flickr).

mysql> desc site_specific;
+----------------+--------------+------+-----+---------+----------------+
| Field          | Type         | Null | Key | Default | Extra          |
+----------------+--------------+------+-----+---------+----------------+
| id             | int(11)      | NO   | PRI | NULL    | auto_increment |
| license_uri    | varchar(255) | NO   |     | NULL    |                |
| site           | varchar(255) | NO   |     | NULL    |                |
| count          | int(11)      | NO   |     | NULL    |                |
| utc_time_stamp | datetime     | NO   | MUL | NULL    |                |
+----------------+--------------+------+-----+---------+----------------+

CREATE TABLE `site_specific` (
  `id` int(11) NOT NULL auto_increment,
  `license_uri` varchar(255) NOT NULL,
  `site` varchar(255) NOT NULL,
  `count` int(11) NOT NULL,
  `utc_time_stamp` datetime NOT NULL,
  PRIMARY KEY  (`id`),
  UNIQUE KEY `utc_time_stamp` (`utc_time_stamp`,`license_uri`,`site`),
  KEY `site_specific_timestamp_index` (`utc_time_stamp`)
);

INSERT INTO `site_specific` VALUES (1,'http://creativecommons.org/licenses/by-nd/2.0/','http://www.flickr.com/',317345,'2006-03-17 12:00:00');

mysql> select distinct(license_uri) from site_specific;
+---------------------------------------------------+
| license_uri                                       |
+---------------------------------------------------+
| http://creativecommons.org/licenses/by-nc-nd/2.0/ |
| http://creativecommons.org/licenses/by-nc-sa/2.0/ |
| http://creativecommons.org/licenses/by-nc/2.0/    |
| http://creativecommons.org/licenses/by-nd/2.0/    |
| http://creativecommons.org/licenses/by-sa/2.0/    |
| http://creativecommons.org/licenses/by/2.0/       |
+---------------------------------------------------+

mysql> select min(utc_time_stamp),max(utc_time_stamp) from site_specific;
+---------------------+---------------------+
| min(utc_time_stamp) | max(utc_time_stamp) |
+---------------------+---------------------+
| 2006-03-17 12:00:00 | 2010-02-25 08:05:01 |
+---------------------+---------------------+