# HG changeset patch # User Ethan Zonca # Date 2010-10-16 02:40:45 # Node ID 18c6d2ea6fe7dc25c799c4ef50cb17e4514301d4 # Parent a091aaf7158ae74f5769a0c540e5f04774f92183 uMich crawler now works properly diff --git a/school.d/umich.inc b/school.d/umich.inc --- a/school.d/umich.inc +++ b/school.d/umich.inc @@ -1,19 +1,20 @@ 'University of Michigan', - 'url' => 'http://umich.edu/', - 'domains' => array( - 'umich.edu', - ), - 'student_address' => 'Wolverine', - ); + 'url' => 'http://umich.edu/', + 'domains' => array( + 'umich.edu', + ), + 'student_address' => 'Wolverine', + ); } function umich_instructions_html() { return <<Umich-specific Instructions +

U of M-specific Instructions

SlatePermutate can be a useful tool for scheduling your next semester.

@@ -28,6 +29,16 @@ function umich_instructions_html() EOF; } +/** Filter out whitepace items */ +function umich_arrayfilter_callback($item){ + if(ltrim($item) == ''){ + return false; + } + else{ + return true; + } +} + /** Parse html at URL into array, first row is row headers */ function umich_table_parse($url) { $arr = array(); @@ -39,13 +50,27 @@ function umich_table_parse($url) { $dom->loadHTML($html); $dom->preserveWhiteSpace = false; $tables = $dom->getElementsByTagName('table'); - $rows = $tables->item(2)->getElementsByTagName('tr'); // Get first table on page + $rows = $tables->item(3)->getElementsByTagName('tr'); // Get first table on page foreach ($rows as $rownum => $row) { - $cols = $row->getElementsByTagName('td'); - foreach($cols as $colnum => $col){ - $arr[$rownum][$colnum] = $col->nodeValue; + if($rownum > 5) { + $cols = $row->getElementsByTagName('td'); + foreach($cols as $colnum => $col){ + $arr[$rownum][$colnum] = $col->nodeValue; + } } } + foreach($arr as &$item) { + $item = array_filter($item, "umich_arrayfilter_callback"); + } + + $arr = array_values($arr); // Reindex array + + // Strip navigation and trailing garbage + $arr[count($arr)-3] = NULL; + $arr[count($arr)-2] = NULL; + $arr[count($arr)-1] = NULL; + + $arr = array_filter($arr); return $arr; } @@ -53,13 +78,14 @@ function umich_table_parse($url) { function umich_crawl($season, $year) { /* Current academic departments. Update as needed. */ $departments = array('AAPTIS','ACABS','AERO','AEROSP','AMCULT','ANTHRARC','ANTHRBIO','ANTHRCUL','AOSS','APPPHYS','ARCH','ARMENIAN','ARTDES','ASIAN','ASIANLAN','ASTRO','AUTO','BCS','BIOINF','BIOLCHEM','BIOLOGY','BIOMEDE','BIOPHYS','CAAS','CEE','CHE','CHEM','CIC','CICS','CJS','CLARCH','CLCIV','CMPLXSYS','COMM','COMP','COMPLIT','CSP','CZECH','DANCE','DUTCH','ECON','EDCURINS','EDUC','EEB','EECS','ELI','ENGLISH','ENGR','ENSCEN','ENVIRON','ESENG','FRENCH','GEOG','GEOSCI','GERMAN','GREEK','GTBOOKS','HBEHED','HISTART','HISTORY','HJCS','HMP','HONORS','INTMED','IOE','ITALIAN','JAZZ','JUDAIC','KINESLGY','LACS','LATIN','LHC','LHSP','LING','MACROMOL','MATH','MATSCIE','MCDB','MECHENG','MEDADM','MEDCHEM','MEMS','MENAS','MFG','MICROBIOL','MILSCI','MKT','MODGREEK','MOVESCI','MUSEUMS','MUSICOL','MUSMETH','MUSTHTRE','NAVARCH','NAVSCI','NERS','NEUROSCI','NRE','NURS','OMS','ORGSTUDY','PAT','PATH','PHARMACY','PHIL','PHRMACOL','PHYSICS','PHYSIOL','POLISH','POLSCI','PORTUG','PSYCH','PUBHLTH','PUBPOL','RCARTS','RCCORE','RCHUMS','RCIDIV','RCLANG','RCNSCI','RCSSCI','REEES','RELIGION','ROMLANG','ROMLING','RUSSIAN','SAC','SAS','SCAND','SEAS','SI','SLAVIC','SOC','SPANISH','STATS','STDABRD','SWC','TCHNCLCM','THEORY','THTREMUS','UC','UKRAINE','UP','WOMENSTD','YIDDISH'); + $basepath = "http://www.lsa.umich.edu/cg/cg_results.aspx"; - $yearsyn = 1810 + $year; // Weird year synonym name where 2000 == 1800 + $yearsyn = 1800 + $year; // Weird year synonym name where 2000 == 1800 $basepath .= "?termArray={$season}_{$year}_${yearsyn}&cgtype=ug"; $season = strtolower($season); $tables = array(); foreach($departments as $department) { - $tables[$department] = umich_table_parse($basepath . '&department=' . $department . '&allsections=true&show=1000'); + $tables[$department] = umich_table_parse($basepath . '&department=' . $department . '&allsections=true&show=1000'); } return $tables; }