# HG changeset patch
# User Nathan Phillip Brink
# Date 2011-10-08 02:37:41
# Node ID b183b9a9baebedb7e48a3822e44316d20410bb22
# Parent 069b10af1b364122a298c7da3023f9703d2b9a4d
Support detecting and removing outliers in semester start/end points when crawling.

Only enabled for calvin, whose data caused the wrong semester to be selected.

diff --git a/inc/class.semester.inc b/inc/class.semester.inc
--- a/inc/class.semester.inc
+++ b/inc/class.semester.inc
@@ -1,4 +1,4 @@
-
  *
@@ -21,6 +21,7 @@
 $inc_dir = dirname(__FILE__) . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'inc' . DIRECTORY_SEPARATOR;
 require_once($inc_dir . 'class.course.inc');
 require_once($inc_dir . 'class.section.php');
+require_once($inc_dir . 'math.inc');
 
 /**
  * \brief
@@ -69,6 +70,8 @@ class Semester
   {
     $this->time_start = 0;
     $this->time_end = 0;
+    $this->time_starts = array();
+    $this->time_ends = array();
     $this->season = $season;
 
     if (strlen($year) != 4 || !is_numeric($year))
@@ -259,8 +262,28 @@ class Semester
       $this->time_end_set($time_end);
   }
 
+  /**
+   * \brief
+   *   Add a potential end time to the pool of end times.
+   */
+  public function time_end_pool_add($time_end)
+  {
+    $this->time_ends[] = $time_end;
+  }
+
+  /**
+   * \brief
+   *   Get the semester's end time. If end times have been pooled,
+   *   the result is the latest pooled time that is not an outlier.
+   */
   public function time_end_get()
   {
+    if (count($this->time_ends))
+      {
+        $times = filter_outliers($this->time_ends);
+        $this->time_end = max($times);
+      }
+
     return $this->time_end;
   }
 
@@ -298,8 +321,35 @@ class Semester
       $this->time_start_set($time_start);
   }
 
+  /**
+   * \brief
+   *   Add a potential semester start time to the pool of potential
+   *   start times.
+   *
+   * The idea is that there might be erroneous entries in a school's
+   * database (
+   * http://www.facebook.com/CalvinRegistrar/posts/299438720070457 )
+   * which would skew the detected start time. Use statistics to
+   * detect and kill outliers by using a pool of start times :-D.
+   */
+  public function time_start_pool_add($time_start)
+  {
+    $this->time_starts[] = $time_start;
+  }
+
+  /**
+   * \brief
+   *   Get the semester's start time. If start times have been pooled,
+   *   the result is the earliest pooled time that is not an outlier.
+   */
   public function time_start_get()
   {
+    if (count($this->time_starts))
+      {
+        $times = filter_outliers($this->time_starts);
+        $this->time_start = min($times);
+      }
+
     return $this->time_start;
   }
 
@@ -377,5 +427,13 @@ class Semester
   public function purge()
   {
     $this->departments = array();
+    /*
+     * Make sure that time_end and time_start are resolved from their
+     * pools before the pools themselves are cleared out.
+     */
+    $this->time_end_get();
+    $this->time_ends = array();
+    $this->time_start_get();
+    $this->time_starts = array();
   }
 }
diff --git a/inc/math.inc b/inc/math.inc
new file mode 100644
--- /dev/null
+++ b/inc/math.inc
@@ -0,0 +1,123 @@
+<?php
+/*
+ *
+ * This file is a part of slate_permutate.
+ *
+ * slate_permutate is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * slate_permutate is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with slate_permutate. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+if (!function_exists('mean'))
+  {
+    /**
+     * \brief
+     *   Calculate the mean of a set of numerical values without
+     *   overflowing stuff.
+     */
+    function mean(array $values)
+    {
+      /*
+       * As the influence of each element reduces with each iteration
+       * in the used algorithm, shuffling the array should give a
+       * better idea of what the actual mean is for larger arrays.
+       */
+      shuffle($values);
+
+      $val = 0;
+      $i = 0;
+      foreach ($values as $value)
+        {
+          $val = $val * $i / ($i + 1)
+            + $value / ($i + 1);
+          $i ++;
+        }
+
+      return $val;
+    }
+  }
+
+if (!function_exists('stddev'))
+  {
+    /**
+     * \brief
+     *   Calculate the sample standard deviation of a set of values.
+     *
+     * Returns 0 for fewer than two values, where the sample variance
+     * is undefined and dividing by (count - 1) would divide by zero.
+     */
+    function stddev(array $values)
+    {
+      $count = count($values);
+      if ($count < 2)
+        return 0.0;
+
+      $mean = mean($values);
+
+      $squares = 0;
+      foreach ($values as $value)
+        $squares += pow($mean - $value, 2);
+      return sqrt($squares / ($count - 1));
+    }
+  }
+
+/**
+ * \brief
+ *   Return the five-number summary (minimum, first quartile, median,
+ *   third quartile, maximum) of an array of sorted values with
+ *   normal integral indexes.
+ *
+ * An empty array yields five zeros.
+ */
+function sp_iqr(array $values)
+{
+  $count = count($values);
+  if (!$count)
+    return array(0, 0, 0, 0, 0);
+  return array(
+               $values[0],
+               $values[(int)($count / 4)],
+               $values[(int)($count / 2)],
+               $values[(int)(3 * $count / 4)],
+               $values[$count - 1],
+               );
+}
+
+/**
+ * \brief
+ *   Remove any `outliers' from an array of values.
+ *
+ * An outlier is defined as any value that falls further than 1.5
+ * standard deviations below the first quartile or above the third
+ * quartile.
+ *
+ * The result is sorted ascending and reindexed from 0.
+ */
+function filter_outliers(array $values)
+{
+  sort($values, SORT_NUMERIC);
+  $values = array_values($values);
+
+  $stddev = stddev($values);
+  /* Indexes 1 and 3 of the summary are the first and third quartiles. */
+  list(, $iqr_min, , $iqr_max) = sp_iqr($values);
+
+  $min = $iqr_min - 1.5 * $stddev;
+  $max = $iqr_max + 1.5 * $stddev;
+
+  $count = count($values);
+  for ($i = 0; $i < $count; $i ++)
+    if ($values[$i] < $min
+        || $values[$i] > $max)
+      unset($values[$i]);
+  return array_values($values);
+}
diff --git a/school.d/calvin.crawl.inc b/school.d/calvin.crawl.inc
--- a/school.d/calvin.crawl.inc
+++ b/school.d/calvin.crawl.inc
@@ -384,20 +384,16 @@ function calvin_crawl_semester(array $sc
                    */
                   $date_start_time = strptime($date_start, '%m/%d/%Y');
                   $date_end_time = strptime($date_end, '%m/%d/%Y');
+
                   if ($date_start_time !== FALSE)
                     {
                       $date_start_time = school_crawl_gmmktime($date_start_time, -5 * 60*60);
-                      if (!$semester_start_min || $semester_start_min > $date_start_time)
-                        {
-                          school_crawl_logf($school_crawl_log, 1, "Using section %s for the minimum start time.", $section_id['department'] . '-' . $section_id['course'] . '-' . $section_id['section']);
-                          $semester_start_min = $date_start_time;
-                        }
+                      $semester->time_start_pool_add($date_start_time);
                     }
                   if ($date_end_time !== FALSE)
                     {
                       $date_end_time = school_crawl_gmmktime($date_end_time, -5 * 60*60);
-                      if ($semester_end_max < $date_end_time)
-                        $semester_end_max = $date_end_time;
+                      $semester->time_end_pool_add($date_end_time);
                     }
                 }
             }
@@ -425,9 +421,6 @@ function calvin_crawl_semester(array $sc
       school_crawl_logf($school_crawl_log, 7, "%s: %d", $reason, $num);
     }
 
-  $semester->time_end_set($semester_end_max);
-  $semester->time_start_set($semester_start_min);
-
   /*
    * Calculate lab-based course dependencies.
    */