Changeset - b183b9a9baeb
[Not reviewed]
default
0 2 1
Nathan Brink (binki) - 14 years ago 2011-10-08 02:37:41
ohnobinki@ohnopublishing.net
Support detecting and removing outliers in semester start/end points when crawling. Only enabled for calvin, whose data caused the wrong semester to be selected.
3 files changed with 159 insertions and 11 deletions:
0 comments (0 inline, 0 general)
inc/class.semester.inc
Show inline comments
 
<?php /* -*- mode: php; -*- */
 
<?php /* -*- mode: php; indent-tabs-mode: nil; -*- */
 
/*
 
 * Copyright 2010 Nathan Phillip Brink <ohnobinki@ohnopublishing.net>
 
 *
 
@@ -21,6 +21,7 @@
 
$inc_dir = dirname(__FILE__) . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'inc' . DIRECTORY_SEPARATOR;
 
require_once($inc_dir . 'class.course.inc');
 
require_once($inc_dir . 'class.section.php');
 
require_once($inc_dir . 'math.inc');
 

	
 
/**
 
 * \brief
 
@@ -69,6 +70,8 @@ class Semester
 
  {
 
    $this->time_start = 0;
 
    $this->time_end = 0;
 
    $this->time_starts = array();
 
    $this->time_ends = array();
 
    $this->season = $season;
 

	
 
    if (strlen($year) != 4 || !is_numeric($year))
 
@@ -259,8 +262,23 @@ class Semester
 
      $this->time_end_set($time_end);
 
  }
 

	
 
  /**
   * \brief
   *   Add a potential end time to the pool of end times.
   *
   * The value is only recorded here. time_end_get() later runs the
   * whole pool through filter_outliers() and takes the largest
   * surviving value as the semester's end time.
   *
   * \param $time_end
   *   A candidate end time to append to $this->time_ends.
   */
  public function time_end_pool_add($time_end)
  {
    $this->time_ends[] = $time_end;
  }
 

	
 
  /**
   * \brief
   *   Get the semester's end time.
   *
   * If any candidates have been collected in $this->time_ends, the
   * stored end time is first replaced by the largest candidate that
   * survives filter_outliers().
   *
   * \return
   *   The value of $this->time_end after any such refresh.
   */
  public function time_end_get()
  {
    /* Nothing pooled: the stored value is all there is to report. */
    if (!count($this->time_ends))
      return $this->time_end;

    $this->time_end = max(filter_outliers($this->time_ends));
    return $this->time_end;
  }
 

	
 
@@ -298,8 +316,30 @@ class Semester
 
      $this->time_start_set($time_start);
 
  }
 

	
 
  /**
   * \brief
   *   Add a potential semester start time to the pool of potential
   *   start times.
   *
   * The idea is that there might be erroneous entries in a school's
   * database (
   * http://www.facebook.com/CalvinRegistrar/posts/299438720070457 )
   * which would skew the detected start time. Use statistics to
   * detect and kill outliers by using a pool of start times :-D.
   *
   * \param $time_start
   *   A candidate start time to append to $this->time_starts.
   */
  public function time_start_pool_add($time_start)
  {
    $this->time_starts[] = $time_start;
  }
 

	
 
  /**
   * \brief
   *   Get the semester's start time.
   *
   * If any candidates have been collected in $this->time_starts, the
   * stored start time is first replaced by the smallest candidate
   * that survives filter_outliers().
   *
   * \return
   *   The value of $this->time_start after any such refresh.
   */
  public function time_start_get()
  {
    if (count($this->time_starts))
      {
        $times = filter_outliers($this->time_starts);
        /*
         * This must update time_start. Writing the minimum start
         * time into time_end would leave the start unset and
         * clobber the already-computed end time.
         */
        $this->time_start = min($times);
      }

    return $this->time_start;
  }
 

	
 
@@ -377,5 +417,13 @@ class Semester
 
  public function purge()
  {
    /*
     * Resolve the end time from its pool of candidates while the
     * pool still exists, then throw the pool away.
     */
    $this->time_end_get();
    $this->time_ends = array();

    /* Same for the start time and its pool. */
    $this->time_start_get();
    $this->time_starts = array();

    /* Finally forget the stored departments. */
    $this->departments = array();
  }
 
}
inc/math.inc
Show inline comments
 
new file 100644
 
<?php /* -*- mode: php; indent-tabs-mode: nil; -*- */
 
/*
 
 * Copyright 2011 Nathan Phillip Brink <ohnobinki@ohnopublishing.net>
 
 *
 
 * This file is a part of slate_permutate.
 
 *
 
 * slate_permutate is free software: you can redistribute it and/or modify
 
 * it under the terms of the GNU Affero General Public License as published by
 
 * the Free Software Foundation, either version 3 of the License, or
 
 * (at your option) any later version.
 
 *
 
 * slate_permutate is distributed in the hope that it will be useful,
 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
 * GNU Affero General Public License for more details.
 
 *
 
 * You should have received a copy of the GNU Affero General Public License
 
 * along with slate_permutate.  If not, see <http://www.gnu.org/licenses/>.
 
 */
 

	
 
if (!function_exists('mean'))
  {
    /**
     * \brief
     *   Calculate the mean of a set of numerical values without
     *   overflowing stuff.
     *
     * A running average is maintained instead of a grand total, so
     * no intermediate value grows beyond the magnitude of the
     * inputs.
     *
     * \param $values
     *   The numbers to average.
     * \return
     *   The mean of $values, or 0 if $values is empty.
     */
    function mean(array $values)
    {
      /*
       * Randomise the visiting order. shuffle() also renumbers the
       * keys from zero, so each key doubles as the count of elements
       * already folded into the running average.
       */
      shuffle($values);

      $running = 0;
      foreach ($values as $seen => $sample)
        {
          $total = $seen + 1;
          $running = $running * $seen / $total
            + $sample / $total;
        }

      return $running;
    }
  }
 

	
 
if (!function_exists('stddev'))
  {
    /**
     * \brief
     *   Calculate the sample standard deviation of a set of
     *   numerical values.
     *
     * The sum of squared deviations from mean() is divided by
     * (count - 1) before the square root is taken.
     *
     * \param $values
     *   The numbers to measure.
     * \return
     *   The standard deviation, or 0.0 when fewer than two values
     *   are given (there is no spread to measure and the divisor
     *   would be zero or negative).
     */
    function stddev(array $values)
    {
      $count = count($values);
      if ($count < 2)
        return 0.0;

      $mean = mean($values);

      $squares = 0;
      foreach ($values as $value)
        $squares += pow($mean - $value, 2);
      return sqrt($squares / ($count - 1));
    }
  }
 

	
 
/**
 * \brief
 *   Return the five boundary points (minimum, first quartile,
 *   median, third quartile, maximum) of an array of sorted values
 *   with normal integral indexes.
 *
 * The points are picked by position: indexes 0, count/4, count/2,
 * 3*count/4 (each truncated to an integer) and count-1. No
 * interpolation between neighbouring values is done.
 *
 * \param $values
 *   Values sorted in ascending order and keyed 0..count-1.
 * \return
 *   A five-element array of the points described above. For an empty
 *   input all five entries are 0, so callers can always rely on the
 *   same shape.
 */
function sp_iqr(array $values)
{
  $count = count($values);
  if (!$count)
    return array(0, 0, 0, 0, 0);
  return array(
               $values[0],
               $values[(int)($count / 4)],
               $values[(int)($count / 2)],
               $values[(int)(3 * $count / 4)],
               $values[$count - 1],
               );
}
 

	
 
/**
 * \brief
 *   Remove any `outliers' from an array of values.
 *
 * An outlier is defined as any value that falls further than 1.5
 * standard deviations outside of the inter-quartile range, i.e.
 * below (first quartile - 1.5 * stddev) or above (third quartile +
 * 1.5 * stddev), with the quartiles taken from sp_iqr().
 *
 * \param $values
 *   The numbers to filter; the caller's array is not modified.
 * \return
 *   The surviving values, sorted numerically ascending and keyed
 *   from zero. Inputs with fewer than two values are returned
 *   (sorted) without filtering, since no spread can be computed.
 */
function filter_outliers(array $values)
{
  /* sort() also renumbers the keys from zero, as sp_iqr() requires. */
  sort($values, SORT_NUMERIC);

  if (count($values) < 2)
    return $values;

  $stddev = stddev($values);
  $quartiles = sp_iqr($values);

  /*
   * Index 1 is the first quartile and index 3 the third quartile.
   * Index 2 is the median and must not be used as the upper bound.
   */
  $min = $quartiles[1] - 1.5 * $stddev;
  $max = $quartiles[3] + 1.5 * $stddev;

  $kept = array();
  foreach ($values as $value)
    {
      if ($value < $min
          || $value > $max)
        continue;
      $kept[] = $value;
    }
  return $kept;
}
school.d/calvin.crawl.inc
Show inline comments
 
@@ -384,20 +384,16 @@ function calvin_crawl_semester(array $sc
 
	   */
 
	  $date_start_time = strptime($date_start, '%m/%d/%Y');
 
	  $date_end_time = strptime($date_end, '%m/%d/%Y');
 

	
 
	  if ($date_start_time !== FALSE)
 
	    {
 
	      $date_start_time = school_crawl_gmmktime($date_start_time, -5 * 60*60);
 
	      if (!$semester_start_min || $semester_start_min > $date_start_time)
 
		{
 
		  school_crawl_logf($school_crawl_log, 1, "Using section %s for the minimum start time.", $section_id['department'] . '-' . $section_id['course'] . '-' . $section_id['section']);
 
		  $semester_start_min = $date_start_time;
 
		}
 
	      $semester->time_start_pool_add($date_start_time);
 
	    }
 
	  if ($date_end_time !== FALSE)
 
	    {
 
	      $date_end_time = school_crawl_gmmktime($date_end_time, -5 * 60*60);
 
	      if ($semester_end_max < $date_end_time)
 
		$semester_end_max = $date_end_time;
 
	      $semester->time_end_pool_add($date_end_time);
 
	    }
 
	}
 
	}
 
@@ -425,9 +421,6 @@ function calvin_crawl_semester(array $sc
 
      school_crawl_logf($school_crawl_log, 7, "%s: %d", $reason, $num);
 
    }
 

	
 
  $semester->time_end_set($semester_end_max);
 
  $semester->time_start_set($semester_start_min);
 

	
 
  /*
 
   * Calculate lab-based course dependencies.
 
   */
0 comments (0 inline, 0 general)