Changeset 21884 in project


Ignore:
Timestamp:
12/09/10 20:25:52 (9 years ago)
Author:
petercrlane
Message:

dataset-utils: trunk

Location:
release/4/dataset-utils/trunk
Files:
4 added
1 edited

Legend:

Unmodified
Added
Removed
  • release/4/dataset-utils/trunk/dataset-utils.scm

    r21796 r21884  
    2424(module
    2525  dataset-utils
    26   (export read-arff
     26  (export read-arff
     27          make-numeric-attribute make-nominal-attribute
     28          make-relation
    2729          attribute-name attribute-definition
    2830          relation-name relation-attributes relation-data
    2931          get-attribute-values filter-instances find-attribute-index
     32          split-instances
     33          entropy
    3034          dataset-utils:self-test)
    3135  (import chicken data-structures extras scheme)
     
    4650                 (definition accessor: attribute-definition)))
    4751
     52  (define (make-nominal-attribute name . att-values)
     53    (make <attribute>
     54          'name name
     55          'definition (cons 'nominal att-values)))
     56
     57  (define (make-numeric-attribute name)
     58    (make <attribute>
     59          'name name
     60          'definition (list 'numeric)))
     61
    4862  ;; record for holding information from an ARFF file
    4963  (define-class <relation> ()
     
    5266                 (data accessor: relation-data)))
    5367
     68  (define (make-relation name attributes data)
     69    (make <relation>
     70          'name name
     71          'attributes attributes
     72          'data data))
     73
     74  ;; return the number of instances in given relation
     75  (define (number-instances relation)
     76    (length (relation-data relation)))
     77
    5478  ;; find the index position of named attribute in relation
    5579  (define (find-attribute-index relation name)
     
    5781      (cond ((>= index (length (relation-attributes relation)))
    5882             (error "Attribute not found"))
    59             ((string=? name
    60                        (attribute-name (list-ref (relation-attributes relation)
    61                                                  index)))
     83            ((equal? name
     84                     (attribute-name (list-ref (relation-attributes relation)
     85                                               index)))
    6286             index)
    6387            (else
     
    86110            'data matching-items)))
    87111
     112  ;; given a relation and an attribute name (which should be 'nominal')
     113  ;; return a list of relations, one for each alternative value of the attribute
     114  (define (split-instances relation name)
     115    (map (lambda (value) (filter-instances relation name value))
     116         (delete-duplicates (get-attribute-values relation name))))
     117
     118  ;; compute the proportion (probability) of instances with given name-value pair
     119  ;; in relation
     120  (define (class-probability relation name value)
     121    (/ (number-instances (filter-instances relation name value))
     122       (number-instances relation)))
     123
     124  ;; compute the entropy of given relation for target-class
     125  (define (entropy relation target-class)
     126    (apply +
     127           (map (lambda (value)
     128                  (let ((prob-class (class-probability relation target-class value)))
     129                    (* -1 prob-class (log-2 prob-class))))
     130                (delete-duplicates (get-attribute-values relation target-class)))))
     131 
     132  ; compute the log of a number in base 2, checking for 0
     133  (define (log-2 x)
     134    (if (= x 0)
     135      -1000   ; log is not defined for 0, so return large negative value
     136      (/ (log x)
     137         (log 2))))
     138
    88139  ;; -----------------------------------------------------------------------------
    89140  ;; The ARFF specific procedures
     
    93144  ;; -- return a representation of data in file
    94145  (define (read-arff filename)
     146    (define (make-definition str)
     147      (if (string=? str "numeric")
     148        (list 'numeric)
     149        (cons 'nominal
     150              (map (lambda (str) (string->symbol (string-trim-both str)))
     151                   (string-split (string-substitute* str '(("{|}" . ""))) ",")))))
    95152    (unless (file-exists? filename)
    96153      (error (string-append "unknown filename: " filename)))
     
    115172                           (set! attributes (cons (make <attribute>
    116173                                                        'name (car pieces)
    117                                                         'definition (cadr pieces))
     174                                                        'definition (make-definition (cadr pieces)))
    118175                                                  attributes))))
    119176                        ((is-relation? lline)
     
    121178                        ((and (not (is-keyword? lline))
    122179                              (not in-header?))
    123                          (set! data (cons (string-split lline ",") data)))
     180                         (set! data (cons
     181                                      (map (lambda (str) (if (number? (string->number str))
     182                                                           (string->number str)
     183                                                           (string->symbol str)))
     184                                           (map string-trim-both
     185                                                (string-split lline ",")))
     186                                      data)))
    124187                        (else
    125188                          (error "Error in processing ARFF"))))
     
    189252  ;; export a procedure for running a set of tests on the internal procedures
    190253  (define (dataset-utils:self-test)
    191     (check-set-mode! 'summary)
    192254    (check (is-comment? "") => #f)
    193255    (check (is-comment? "%") => #t)
     
    212274    (check (remove-keyword "@relation    abc") => "abc")
    213275    (check (remove-keyword "@attribute ") => "")
    214     (check-report))
     276    )
    215277  )
    216278
Note: See TracChangeset for help on using the changeset viewer.