Changeset - 18c6d2ea6fe7
[Not reviewed]
default
0 1 0
Ethan Zonca - 15 years ago 2010-10-16 02:40:45
ez@ethanzonca.com
uMich crawler now works properly
1 file changed with 39 insertions and 13 deletions:
0 comments (0 inline, 0 general)
school.d/umich.inc
Show inline comments
 
<?php
 

	
 
/**
 * Return descriptive metadata for the University of Michigan.
 *
 * @return array Map with the keys 'name', 'url', 'domains' (a list of
 *   domain names) and 'student_address'.
 */
function umich_info()
{
  return array('name' => 'University of Michigan',
               'url' => 'http://umich.edu/',
               'domains' => array(
                                  'umich.edu',
                                  ),
               'student_address' => 'Wolverine',
               );
}
 

	
 
function umich_instructions_html()
 
{
 
  return <<<EOF
 
<h2>Umich-specific Instructions</h2>
 
<h2>U of M-specific Instructions</h2>
 
<p>
 
  SlatePermutate can be a useful tool for scheduling your next semester.
 
</p>
 
@@ -28,6 +29,16 @@ function umich_instructions_html()
 
EOF;
 
}
 

	
 
/**
 * Callback for array_filter() that rejects whitespace-only items.
 *
 * @param mixed $item Value to test; it is passed through ltrim().
 * @return bool false when nothing remains after stripping leading
 *   whitespace, true otherwise.
 */
function umich_arrayfilter_callback($item){
    // An item made only of whitespace collapses to '' under ltrim().
    return ltrim($item) != '';
}
 

	
 
/** Parse html at URL into array, first row is row headers */
 
function umich_table_parse($url) {
 
  $arr = array();
 
@@ -39,13 +50,27 @@ function umich_table_parse($url) {
 
  $dom->loadHTML($html);
 
  $dom->preserveWhiteSpace = false;
 
  $tables = $dom->getElementsByTagName('table');
 
  $rows = $tables->item(2)->getElementsByTagName('tr'); // Get first table on page 
 
  $rows = $tables->item(3)->getElementsByTagName('tr'); // Get first table on page 
 
  foreach ($rows as $rownum => $row) {
 
    $cols = $row->getElementsByTagName('td');
 
    foreach($cols as $colnum => $col){
 
      $arr[$rownum][$colnum] = $col->nodeValue;
 
    if($rownum > 5) {
 
      $cols = $row->getElementsByTagName('td');
 
      foreach($cols as $colnum => $col){
 
        $arr[$rownum][$colnum] = $col->nodeValue;
 
      }
 
    }
 
  }
 
  foreach($arr as &$item) {
 
    $item = array_filter($item, "umich_arrayfilter_callback");
 
  }
 

	
 
  $arr = array_values($arr); // Reindex array
 
 
 
  // Strip navigation and trailing garbage
 
  $arr[count($arr)-3] = NULL;
 
  $arr[count($arr)-2] = NULL;
 
  $arr[count($arr)-1] = NULL;
 

	
 
  $arr = array_filter($arr);
 
  return $arr;
 
}
 

	
 
@@ -53,13 +78,14 @@ function umich_table_parse($url) {
 
/**
 * Collect the parsed course-guide table of every listed department
 * for one term.
 *
 * @param string $season Season identifier; interpolated as-is into the
 *   termArray query parameter.
 * @param int $year Year value; interpolated into termArray and also
 *   added to 1800 to form the "year synonym" part of the term.
 * @return array Map of department code => value returned by
 *   umich_table_parse() for that department's results URL.
 */
function umich_crawl($season, $year) {
  /* Current academic departments. Update as needed. */
  $departments = array('AAPTIS','ACABS','AERO','AEROSP','AMCULT','ANTHRARC','ANTHRBIO','ANTHRCUL','AOSS','APPPHYS','ARCH','ARMENIAN','ARTDES','ASIAN','ASIANLAN','ASTRO','AUTO','BCS','BIOINF','BIOLCHEM','BIOLOGY','BIOMEDE','BIOPHYS','CAAS','CEE','CHE','CHEM','CIC','CICS','CJS','CLARCH','CLCIV','CMPLXSYS','COMM','COMP','COMPLIT','CSP','CZECH','DANCE','DUTCH','ECON','EDCURINS','EDUC','EEB','EECS','ELI','ENGLISH','ENGR','ENSCEN','ENVIRON','ESENG','FRENCH','GEOG','GEOSCI','GERMAN','GREEK','GTBOOKS','HBEHED','HISTART','HISTORY','HJCS','HMP','HONORS','INTMED','IOE','ITALIAN','JAZZ','JUDAIC','KINESLGY','LACS','LATIN','LHC','LHSP','LING','MACROMOL','MATH','MATSCIE','MCDB','MECHENG','MEDADM','MEDCHEM','MEMS','MENAS','MFG','MICROBIOL','MILSCI','MKT','MODGREEK','MOVESCI','MUSEUMS','MUSICOL','MUSMETH','MUSTHTRE','NAVARCH','NAVSCI','NERS','NEUROSCI','NRE','NURS','OMS','ORGSTUDY','PAT','PATH','PHARMACY','PHIL','PHRMACOL','PHYSICS','PHYSIOL','POLISH','POLSCI','PORTUG','PSYCH','PUBHLTH','PUBPOL','RCARTS','RCCORE','RCHUMS','RCIDIV','RCLANG','RCNSCI','RCSSCI','REEES','RELIGION','ROMLANG','ROMLING','RUSSIAN','SAC','SAS','SCAND','SEAS','SI','SLAVIC','SOC','SPANISH','STATS','STDABRD','SWC','TCHNCLCM','THEORY','THTREMUS','UC','UKRAINE','UP','WOMENSTD','YIDDISH');

  $basepath = "http://www.lsa.umich.edu/cg/cg_results.aspx";
  $yearsyn = 1800 + $year; // Weird year synonym name where 2000 == 1800
  // NOTE(review): $season used to be lowercased only AFTER this URL was
  // built, which had no effect on the request. Confirm whether the query
  // string is meant to carry the lowercase form.
  $basepath .= "?termArray={$season}_{$year}_{$yearsyn}&cgtype=ug";

  $tables = array();
  foreach($departments as $department) {
    // One parse per department; the URL is identical on a repeat call,
    // so fetching it a second time only duplicates work.
    $tables[$department] = umich_table_parse($basepath . '&department=' . $department . '&allsections=true&show=1000');
  }
  return $tables;
}
0 comments (0 inline, 0 general)