Difference between revisions of "Metrics/Data Catalog"

From Creative Commons
Jump to: navigation, search
 
Line 1: Line 1:
 
This page describes raw data available concerning [[Metrics/License statistics|CC license use]].
 
This page describes raw data available concerning [[Metrics/License statistics|CC license use]].
  
See http://labs.creativecommons.org/2011/06/26/powerofopen-metrics/ for some tips on using this data.
+
See http://labs.creativecommons.org/2011/06/27/powerofopen-metrics/ for some tips on using this data.
  
 
== per license link: search engine queries ==
 
== per license link: search engine queries ==

Latest revision as of 03:19, 27 June 2011

This page describes raw data available concerning CC license use.

See http://labs.creativecommons.org/2011/06/27/powerofopen-metrics/ for some tips on using this data.

per license link: search engine queries

The "simple" table contains the results of link: queries for each distinct license URL dispatched to a number of search engines each day. Yahoo's SiteExplorer gives the most useful results -- Google's are very incomplete, presumably to not give link spammers an in theory useful number.

Here's the table description and equivalent table creation statement, an example insertion, and illustrative unique values:

mysql> desc simple;
+-----------------+--------------+------+-----+---------------------+----------------+
| Field           | Type         | Null | Key | Default             | Extra          |
+-----------------+--------------+------+-----+---------------------+----------------+
| id              | int(11)      | NO   | PRI | NULL                | auto_increment |
| license_uri     | varchar(255) | NO   |     |                     |                |
| search_engine   | varchar(255) | NO   |     |                     |                |
| count           | int(11)      | NO   |     | 0                   |                |
| timestamp       | datetime     | NO   | MUL | 0000-00-00 00:00:00 |                |
| jurisdiction    | varchar(255) | YES  |     | NULL                |                |
| license_type    | varchar(16)  | YES  |     | NULL                |                |
| license_version | varchar(16)  | YES  |     | NULL                |                |
+-----------------+--------------+------+-----+---------------------+----------------+

CREATE TABLE `simple` (
  `id` int(11) NOT NULL auto_increment,
  `license_uri` varchar(255) NOT NULL default '',
  `search_engine` varchar(255) NOT NULL default '',
  `count` int(11) NOT NULL default '0',
  `timestamp` datetime NOT NULL default '0000-00-00 00:00:00',
  `jurisdiction` varchar(255) default NULL,
  `license_type` varchar(16) default NULL,
  `license_version` varchar(16) default NULL,
  PRIMARY KEY  (`id`),
  KEY `timestamp_index` (`timestamp`)
);

INSERT INTO `simple` VALUES (750584,'http://creativecommons.org/licenses/by-sa/2.0/jp/','MSN',3955,'2006-07-11 16:04:56','jp','by-sa','2.0');

mysql> select now(), count(*) from simple;
+---------------------+----------+
| now()               | count(*) |
+---------------------+----------+
| 2010-03-09 18:21:26 |  2029989 |
+---------------------+----------+

mysql> select count(distinct(license_uri)) from simple;
+------------------------------+
| count(distinct(license_uri)) |
+------------------------------+
|                          540 |
+------------------------------+

mysql> select distinct(search_engine) from simple;
+---------------+
| search_engine |
+---------------+
| MSN           |
| Google        |
| Yahoo         |
| All The Web   |
+---------------+

mysql> select min(timestamp), max(timestamp) from simple;
+---------------------+---------------------+
| min(timestamp)      | max(timestamp)      |
+---------------------+---------------------+
| 2003-05-23 23:20:43 | 2010-02-25 00:05:21 |
+---------------------+---------------------+

mysql> select count(distinct(jurisdiction)) from simple;
+-------------------------------+
| count(distinct(jurisdiction)) |
+-------------------------------+
|                            55 |
+-------------------------------+

mysql> select distinct(license_type) from simple;
+--------------+
| license_type |
+--------------+
| by-nc        |
| by-nc-nd     |
| nc           |
| nd           |
| by-sa        |
| by           |
| nc-sa        |
| nd-nc        |
| sa           |
| by-nd        |
| by-nc-sa     |
| sampling     |
| sampling+    |
| nc-sampling+ |
| publicdomain |
| GPL          |
| LGPL         |
| devnations   |
| NULL         |
| nc-nd        |
| zero         |
+--------------+

mysql> select distinct(license_version) from simple;
+-----------------+
| license_version |
+-----------------+
| 1.0             |
| 2.0             |
| 2.5             |
| 2.1             |
|                 |
| NULL            |
| 3.0             |
+-----------------+

mysql> select distinct(license_uri) from simple where license_version is NULL;
+--------------------------------------------------+
| license_uri                                      |
+--------------------------------------------------+
| http://creativecommons.org                       |
| http://www.creativecommons.org                   |
| http://creativecommons.org/licenses/publicdomain |
+--------------------------------------------------+

These can be removed for many purposes:

mysql> select distinct(license_uri) from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL';
+--------------------------------------------------+
| license_uri                                      |
+--------------------------------------------------+
| http://creativecommons.org/licenses/GPL/2.0/     |
| http://creativecommons.org/licenses/LGPL/2.1/    |
| http://creativecommons.org                       |
| http://www.creativecommons.org                   |
| http://creativecommons.org/licenses/publicdomain |
+--------------------------------------------------+

mysql> delete from simple where license_version is NULL or license_type = 'GPL' or license_type = 'LGPL';
Query OK, 22822 rows affected (10.56 sec)


License property search engine API queries

Yahoo and Google allow filtering searches by CC license property (eg allows commercial use). This is coarser than per license link: queries, but unlike link: can be (indeed must be) combined with keyword and optionally country and language operators. This facilitates another take on the distribution of CC licenses not dependent on availability or use of jurisdiction ported licenses. Currently only Yahoo is regularly queried. Countries and languages are those supported for Yahoo queries.

See http://labs.creativecommons.org/2010/12/09/available-for-analysis/ for ideas on using this data.

mysql> desc complex;
+-------------------+--------------+------+-----+---------------------+----------------+
| Field             | Type         | Null | Key | Default             | Extra          |
+-------------------+--------------+------+-----+---------------------+----------------+
| id                | int(11)      | NO   | PRI | NULL                | auto_increment |
| license_specifier | varchar(255) | NO   |     |                     |                |
| search_engine     | varchar(255) | NO   |     |                     |                |
| count             | int(11)      | NO   |     | 0                   |                |
| query             | varchar(255) | YES  |     | NULL                |                |
| timestamp         | datetime     | NO   | MUL | 0000-00-00 00:00:00 |                |
| country           | varchar(255) | YES  |     | NULL                |                |
| language          | varchar(255) | YES  |     | NULL                |                |
+-------------------+--------------+------+-----+---------------------+----------------+

CREATE TABLE `complex` (
  `id` int(11) NOT NULL auto_increment,
  `license_specifier` varchar(255) NOT NULL default '',
  `search_engine` varchar(255) NOT NULL default '',
  `count` int(11) NOT NULL default '0',
  `query` varchar(255) default NULL,
  `timestamp` datetime NOT NULL default '0000-00-00 00:00:00',
  `country` varchar(255) default NULL,
  `language` varchar(255) default NULL,
  PRIMARY KEY  (`id`),
  KEY `complex_timestamp_index` (`timestamp`)
);

INSERT INTO `complex` VALUES (32,'cc_any','Yahoo',3,'license','2006-07-11 16:24:42','Norway','portuguese');

mysql> select count(*), license_specifier, search_engine from complex group by license_specifier, search_engine;
+----------+---------------------------------------------------------------------------+---------------+
| count(*) | license_specifier                                                         | search_engine |
+----------+---------------------------------------------------------------------------+---------------+
|   199255 |                                                                           | All The Web   |
|   160438 |                                                                           | Yahoo         |
|      960 | &ccs=c                                                                    | Yahoo         |
|      960 | &ccs=c&ccs=e                                                              | Yahoo         |
|      960 | &ccs=e                                                                    | Yahoo         |
|  4619917 | cc_any                                                                    | Yahoo         |
|      898 | cc_attribute                                                              | Google        |
|  4483821 | cc_commercial                                                             | Yahoo         |
|  4318628 | cc_commercial&cc_modifiable                                               | Yahoo         |
|  4387633 | cc_modifiable                                                             | Yahoo         |
|      900 | cc_noncommercial                                                          | Google        |
|      901 | cc_nonderived                                                             | Google        |
|      900 | cc_publicdomain                                                           | Google        |
|      912 | cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived | Google        |
|      914 | cc_sharealike                                                             | Google        |
+----------+---------------------------------------------------------------------------+---------------+

mysql> select distinct query from complex;
+---------------------+
| query               |
+---------------------+
| license             |
| -license            |
| work                |
| -work               |
| html                |
| -html               |
| license OR -license |
| work OR -work       |
| html OR -html       |
+---------------------+
9 rows in set (25.23 sec)

mysql> select min(timestamp),max(timestamp) from complex;
+---------------------+---------------------+
| min(timestamp)      | max(timestamp)      |
+---------------------+---------------------+
| 2005-06-21 00:00:00 | 2010-02-25 00:05:21 |
+---------------------+---------------------+

mysql> select count(distinct(country)) from complex;
+--------------------------+
| count(distinct(country)) |
+--------------------------+
|                       24 |
+--------------------------+

mysql> select count(distinct(language)) from complex;
+---------------------------+
| count(distinct(language)) |
+---------------------------+
|                        31 |
+---------------------------+


Content repository queries

Obtain number of licensed works from individual repositories (currently only Flickr).

mysql> desc site_specific;
+----------------+--------------+------+-----+---------+----------------+
| Field          | Type         | Null | Key | Default | Extra          |
+----------------+--------------+------+-----+---------+----------------+
| id             | int(11)      | NO   | PRI | NULL    | auto_increment |
| license_uri    | varchar(255) | NO   |     | NULL    |                |
| site           | varchar(255) | NO   |     | NULL    |                |
| count          | int(11)      | NO   |     | NULL    |                |
| utc_time_stamp | datetime     | NO   | MUL | NULL    |                |
+----------------+--------------+------+-----+---------+----------------+

CREATE TABLE `site_specific` (
  `id` int(11) NOT NULL auto_increment,
  `license_uri` varchar(255) NOT NULL,
  `site` varchar(255) NOT NULL,
  `count` int(11) NOT NULL,
  `utc_time_stamp` datetime NOT NULL,
  PRIMARY KEY  (`id`),
  UNIQUE KEY `utc_time_stamp` (`utc_time_stamp`,`license_uri`,`site`),
  KEY `site_specific_timestamp_index` (`utc_time_stamp`)
);

INSERT INTO `site_specific` VALUES (1,'http://creativecommons.org/licenses/by-nd/2.0/','http://www.flickr.com/',317345,'2006-03-17 12:00:00');

mysql> select distinct(license_uri) from site_specific;
+---------------------------------------------------+
| license_uri                                       |
+---------------------------------------------------+
| http://creativecommons.org/licenses/by-nc-nd/2.0/ |
| http://creativecommons.org/licenses/by-nc-sa/2.0/ |
| http://creativecommons.org/licenses/by-nc/2.0/    |
| http://creativecommons.org/licenses/by-nd/2.0/    |
| http://creativecommons.org/licenses/by-sa/2.0/    |
| http://creativecommons.org/licenses/by/2.0/       |
+---------------------------------------------------+

mysql> select min(utc_time_stamp),max(utc_time_stamp) from site_specific;
+---------------------+---------------------+
| min(utc_time_stamp) | max(utc_time_stamp) |
+---------------------+---------------------+
| 2006-03-17 12:00:00 | 2010-02-25 08:05:01 |
+---------------------+---------------------+