Difference between revisions of "Rewrite Metadata Validator/SoC 2008/IRC Scanner"

Latest revision as of 22:12, 19 April 2010

To search for one's own activity in the Creative Commons IRC channel, one can use Google or the following public-domain script written in PHP 5: <source lang="php"><?php

// Scan this year's public #cc IRC logs and keep local copies of the ones
// that contain the given nickname. Logs found not to contain it are
// remembered in irrelevant.txt so later runs do not download them again.

// Nickname to look for in the channel logs.
$nick = 'john';
// Index page listing the public #cc channel logs.
$path = 'http://mirrors.creativecommons.org/irc/cc/';

// Collect the URL-encoded names of this year's logs from the index page.
preg_match_all('/%23cc\.'.date('Y').'\-\d\d\-\d\d\.log\.html/', file_get_contents($path), $matches);

// Names of logs already found not to contain $nick, cached between runs.
$irrelevant = array();
if (file_exists('irrelevant.txt')) {
    $irrelevant = unserialize(file_get_contents('irrelevant.txt'));
    if (!is_array($irrelevant)) {
        // unserialize() returns false on a corrupt cache; start over.
        $irrelevant = array();
    }
}

// file_put_contents() does not create missing directories.
if (!is_dir('relevant')) {
    mkdir('relevant');
}

foreach ($matches[0] as $url) {
    // Drop the URL-encoded "#" to obtain the local file name.
    $filename = str_replace('%23', '', $url);
    if (file_exists('relevant/'.$filename) || in_array($filename, $irrelevant)) {
        echo 'Skipped ', $filename, PHP_EOL;
        continue;
    }

    $contents = file_get_contents($path.$url);
    if ($contents === false) {
        // A failed download must not be cached as irrelevant; retry next run.
        echo 'Failed ', $filename, PHP_EOL;
        continue;
    }

    // Match the nickname closing a header cell that is immediately followed
    // by a text cell. strpos() is used instead of strstr() because only
    // presence matters, not the matched tail.
    if (strpos($contents, $nick.'</th><td class="text"') === false) {
        echo 'Irrelevant ', $filename, PHP_EOL;
        $irrelevant[] = $filename;
        continue;
    }

    file_put_contents('relevant/'.$filename, $contents);
    echo 'Downloaded ', $filename, PHP_EOL;
}

file_put_contents('irrelevant.txt', serialize($irrelevant));
echo 'Saved irrelevant.txt', PHP_EOL;</source>
Please note that the script above searches only the logs saved during the current year. All relevant logs are downloaded to the local machine.
What follows is the above program translated into Python 2.6 and Python 3.1.
<source lang="python">#!/usr/bin/env python

-*- coding: utf-8 -*-

import datetime import os import pickle import re try: import urllib.request as connection except ImportError: import urllib as connection nick = 'john' path = 'http://mirrors.creativecommons.org/irc/cc/' irrelevant = [] if os.path.exists('irrelevant.txt'): f = open('irrelevant.txt', 'r') irrelevant = pickle.load(f) f.close() conn = connection.urlopen(path) index = conn.read().decode('utf-8', 'ignore') conn.close() if not os.path.isdir('relevant'): os.mkdir('relevant') for url in re.findall('%23cc\.{0}\-\d\d\-\d\d\.log\.html'.\ format(datetime.date.today().year), index): filename, contents = url.replace('%23', ), if os.path.exists('relevant/{0}'.format(filename)) or \ filename in irrelevant: print('Skipped {0}'.format(filename)) continue conn = connection.urlopen('{0}{1}'.format(path, url)) contents = conn.read().decode('utf-8', 'ignore') conn.close() if contents.find('{0}<td class="text"'.format(nick)) == -1: print('Irrelevant {0}'.format(filename)) irrelevant.append(filename) continue f = open('relevant/{0}'.format(filename), 'w') f.write(str(contents.encode('ascii', 'xmlcharrefreplace'))) f.close() print('Downloaded {0}'.format(filename)) f = open('irrelevant.txt', 'w+') pickle.dump(irrelevant, f) f.close() print('Saved irrelevant.txt')</source>

Difference between revisions of "Rewrite Metadata Validator/SoC 2008/IRC Scanner"

Latest revision as of 22:12, 19 April 2010

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

default links

wiki navigation

Tools

@@ Line 1: / Line 1: @@
 If one wants to search for his or her activity at [[IRC|the IRC channel of Creative Commons]], he or she can use [http://www.google.com/search?&q=john+site%3Ahttp%3A%2F%2Fmirrors.creativecommons.org%2Firc%2Fcc%2F Google] or the following public domain script written in PHP 5:
-<pre><?php
+<source lang="php"><?php
 $nick = 'john';
@@ Line 25: / Line 25: @@
 }
 file_put_contents('irrelevant.txt', serialize($irrelevant));
-echo 'Saved irrelevant.txt', PHP_EOL;</pre>
+echo 'Saved irrelevant.txt', PHP_EOL;</source>
 Please note that the herewith enclosed script searches the logs saved the current year. All relevant logs are downloaded to the local machine.
+What follows is the above program translated into Python 2.6 and Python 3.1.
+<source lang="python">#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import datetime
+import os
+import pickle
+import re
+try:
+    import urllib.request as connection
+except ImportError:
+    import urllib as connection
+nick = 'john'
+path = 'http://mirrors.creativecommons.org/irc/cc/'
+irrelevant = []
+if os.path.exists('irrelevant.txt'):
+    f = open('irrelevant.txt', 'r')
+    irrelevant = pickle.load(f)
+    f.close()
+conn = connection.urlopen(path)
+index = conn.read().decode('utf-8', 'ignore')
+conn.close()
+if not os.path.isdir('relevant'):
+    os.mkdir('relevant')
+for url in re.findall('%23cc\.{0}\-\d\d\-\d\d\.log\.html'.\
+                      format(datetime.date.today().year), index):
+    filename, contents = url.replace('%23', ''), ''
+    if os.path.exists('relevant/{0}'.format(filename)) or \
+       filename in irrelevant:
+        print('Skipped {0}'.format(filename))
+        continue
+    conn = connection.urlopen('{0}{1}'.format(path, url))
+    contents = conn.read().decode('utf-8', 'ignore')
+    conn.close()
+    if contents.find('{0}</th><td class="text"'.format(nick)) == -1:
+        print('Irrelevant {0}'.format(filename))
+        irrelevant.append(filename)
+        continue
+    f = open('relevant/{0}'.format(filename), 'w')
+    f.write(str(contents.encode('ascii', 'xmlcharrefreplace')))
+    f.close()
+    print('Downloaded {0}'.format(filename))
+f = open('irrelevant.txt', 'w+')
+pickle.dump(irrelevant, f)
+f.close()
+print('Saved irrelevant.txt')</source>