
Saturday, June 4, 2011

Python wrapper for running GraphLab GaBP linear solver

Daniel Zerbino, a postdoc at UCSC who is working on smoothing genomic sequencing measurements with constraints, sent me a Python script for converting data to the GraphLab GaBP format. Here it is:

#!/usr/bin/env python

#############################################
# Python wrapper for GaBP application in GraphLab
# By Daniel Zerbino, based on Matlab code by Danny Bickson
#############################################

import sys
import os
import tempfile
import struct
import numpy as np
import subprocess

#############################################
# Convenience binary writer/reader functions:
#############################################

def writeDouble(F, x):
     F.write(struct.pack('<d', x))

def writeVector(F, vector):
     for X in vector:
          writeDouble(F, X)

def writeInt(F, x):
     F.write(struct.pack('<i', x))

def readInt(F):
     return struct.unpack('<i', F.read(struct.calcsize('i')))[0]

def readDouble(F):
     return struct.unpack('<d', F.read(struct.calcsize('d')))[0]

def readVector(F, n):
     return [readDouble(F) for i in range(n)]

#############################################
# Convenience Matlab-like functions
#############################################

def enumerate(M):
     # note: this shadows the built-in enumerate, which is not needed in this script
     for i in range(np.shape(M)[0]):
          for j in range(np.shape(M)[1]):
               # Beware of Matlab numbering!!
               yield i+1, j+1, M[i,j]

def find(M):
     return filter(lambda X: X[2] != 0, enumerate(M))

#############################################
# Script for exporting a system of linear equations of the type Ax = y to
# GraphLab format.
# Written by Danny Bickson, CMU
# Input: fn - output file name
# A - an mxn matrix (if A is square then m=n)
# y - mx1 observation vector
# x - nx1 known solution (optional; if not given, a vector of zeros is written)
# sigma - an (m+n)x1 vector of noise levels (optional, for non-square matrices only)
# Conversion to Python by Daniel Zerbino, UCSC
#############################################

def save_c_gl(fn, A, y, x=None, sigma_y=None, sigma_x=None, square=False):
    m, n = np.shape(A)
    if len(y) != m:
       sys.exit('y vector length should equal the number of A rows (%i and %i resp.)' % (len(y), m))

    if x is not None and len(x) != n:
       sys.exit('x vector length should equal the number of A columns')

    if sigma_y is None and sigma_x is not None:
       sys.exit("sigma_y should be provided when entering sigma_x")

    if sigma_x is None and sigma_y is not None:
       sys.exit("sigma_x should be provided when entering sigma_y")

    if sigma_x is not None:
       if square:
          sys.exit('sigma noise level input is allowed only for non-square matrices')
       else:
          if len(sigma_x) != n:
             sys.exit('sigma_x length should be number of cols of A')
          if len(sigma_y) != m:
             sys.exit('sigma_y length should be number of rows of A')

    if square:
        # matrix is square, edges are the non-diagonal entries
        vals = find((A - np.diag(np.diag(A))))
        print "Saving a square matrix A"
    else:
        # matrix is not square, edges are the non-zero values
        vals = find(A)
        print "Saving a non-square matrix A"

    F = open(fn, 'wb')

    #write matrix size
    writeInt(F, m)
    writeInt(F, n)

    # write y (the observation), x (the solution, if known), diag(A) the
    # variance (if known, else default variance of 1)
    writeVector(F, y)
    if x is not None:
        writeVector(F, x)
    else:
        writeVector(F, (0 for i in range(n)))

    if square:
        writeVector(F, np.diag(A))
    else:
        if sigma_y is not None:
            writeVector(F, sigma_y)
            writeVector(F, sigma_x)
        else:
            writeVector(F, (1 for i in range(m+n)))

    #write number of edges
    assert len(vals) > 0
    writeInt(F, len(vals))
    # pad with zeros for 64 bit offset
    writeInt(F, 0)

    if not square:
        offset = m
    else:
        offset = 0

    #write all edges
    for val in vals:
       writeInt(F, val[0])
       writeInt(F, val[1] + offset)
       writeDouble(F, val[2])

    F.close()

    #verify written file header
    F = open(fn,'rb')
    x = readInt(F)
    assert x == m
    F.close()

    print 'Wrote successfully into file: %s' % fn

########################################################
# script for reading the output of the GaBP GraphLab program back into Python
# returns x = inv(A)*b as computed by GaBP
# returns diag = diag(inv(A)) - an approximation to the main diagonal of
# the inverse matrix of A.
# Written by Danny Bickson, CMU
# Conversion to Python by Daniel Zerbino, UCSC
########################################################

def load_c_gl(filename, columns):
    F = open(filename, 'rb')

    x = readVector(F, columns)
    diag = readVector(F, columns)

    F.close()
    os.remove(filename)
    return x, diag

########################################################
## Wrapper Utility to be used from outside
########################################################

def runGaBP(convergence, A, y, sigma_y=None, x=None, sigma_x=None, square=False):
    fd, input = tempfile.mkstemp(dir='.')
    os.close(fd)  # close the open descriptor; save_c_gl reopens the file by name


    save_c_gl(input, A, y, x=x, sigma_y=sigma_y, sigma_x=sigma_x, square=square)

    args = ['gabp', '--data', input, '--threshold', str(convergence), '--algorithm', '0', '--scheduler=round_robin', '--square']
    if not square:
        args.append('false')
    else:
        args.append('true')
    print "Running " + " ".join(args)
    ret = subprocess.Popen(args, stdout=sys.stdout, stderr=subprocess.STDOUT).wait()

    if ret != 0:
        sys.exit("GaBP did not complete")

    os.remove(input)
    x2, diag = load_c_gl(input + ".out", np.shape(A)[1])  # read n values back (works even when x is None)
    return x2, diag

#########################################################
## Unit test
#########################################################
def main():
    A = np.array([[0.2785, 0.9649], [0.5469, 0.1576], [0.9575, 0.9706]])
    y = np.array([1.2434, 0.7045, 1.9281])
    sigma_y = np.array([1e-10, 1e-10, 1e-10])
    x = np.array([0, 0])
    sigma_x = np.array([1, 1])
    convergence = 1e-10
    x2, diag = runGaBP(convergence, A, y, sigma_y=sigma_y, x=x, sigma_x=sigma_x)

    print 'A'
    print A
    print 'y'
    print y
    print 'Initial X'
    print x
    print 'Initial Error'
    print A.dot(x) - y
    print 'Final X'
    print x2
    print 'Final Error'
    print A.dot(x2) - y
    print 'diag'
    print diag

if __name__ == '__main__':
    main()
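
For a quick sanity check of the solver result, it can help to compare against a dense least-squares solution computed directly with numpy. The snippet below is my own illustrative addition (it is not part of Daniel's script) and reuses the small test system from main() above; with the near-zero sigma_y used there, the GaBP answer x2 should roughly agree with the least-squares solution.

import numpy as np

A = np.array([[0.2785, 0.9649], [0.5469, 0.1576], [0.9575, 0.9706]])
y = np.array([1.2434, 0.7045, 1.9281])

# dense reference solution: x_ls minimizes ||A*x - y||_2
x_ls, residuals, rank, sv = np.linalg.lstsq(A, y)

print 'least-squares x:', x_ls
print 'residual error :', A.dot(x_ls) - y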

Saturday, April 16, 2011

Yahoo! KDD CUP using GraphLab - Part 2

Preparing the input 

Sanmi Koyejo, a graduate student at the University of Texas at Austin, sent me a Python script for converting the Yahoo! KDD Cup dataset into GraphLab format. Thanks so much!

It may be preferable to the Matlab script, since for some users the Matlab script runs out of memory.

The Python script below reads the KDD dataset. It requires numpy, since it uses numpy's ndarray.tofile() method to write the binary output.

Additional information about the conversion procedure is kindly supplied by Yoyo here.

NOTE: To run the resulting files in GraphLab, you will need access to a 64-bit machine (a 32-bit machine cannot load this dataset in its current form).


'''
Created on Apr 16, 2011
Read KDD cup data (Low Memory) to store in format suitable for graphlab pmf 

Requires numpy, uses ndarray.tofile() to write the binary file

Module uses (A LOT) less memory than readKddData.py by reading and writing one user at a time
The tradeoff is that the max user id, item id, days and number of ratings must be known beforehand
This is because pmf expects this input in the first line of the File

Known Issue: Number of test (-f3) items is 624959, although we hard-code 624961.
This restriction comes from a bug(?) in pmf (to be fixed soon)
Ignore this warning if this is your only issue

usage: python readKddLM.py --help
python readKddLM.py -i trainIdx.txt -o kddcup -f 1
python readKddLM.py -i validationIdx.txt -o kddcupe -f 2
python readKddLM.py -i testIdx.txt -o kddcupt -f 3
@author: Sanmi Koyejo; [email protected]
'''

from optparse import OptionParser
from numpy import array, dtype, amax, maximum, zeros, int_

def readLine(fileHandle, splitter='|'):
    ''' read single line'''
    line = fileHandle.readline()
    if not line: #EOF
        return line # return null and let caller handle it
    return line.rstrip().split(splitter) # split the line and remove newline character

def readChunk(fileHandle, chunkSize, splitter='\t'):
    '''read a pre-specified chunksize'''
    for _ in range(chunkSize):
        line = fileHandle.readline()
        if not line: #EOF
            break
        yield line.rstrip().split(splitter)
        
def readOneUser(fileHandle, testFlag=False, verbose=True):
    ''' reads data for one user and returns rating Matrix'''
    
    while 1:
        line = readLine(fileHandle)
        if not line: break # EOF
        assert(len(line)==2)
        userID = float(line[0])
        nRatings = int(line[1])
        
        rateMat = zeros( (nRatings, 4), dtype=dtype('f4'))
        rateMat[:,0] = userID+1 # user ID
        
        for count, line in enumerate(readChunk(fileHandle, nRatings)):
            # note allow last user to break nratings constraint. All other users should satisfy this
            rateMat[count, 1] = float(line[0])+1 # item ID
            
            if testFlag:
                assert(len(line)==3) # error checking
                rateMat[count, 2] = float(line[1]) # day
                rateMat[count, 3] = 1.0 # rating
            else:
                assert(len(line)==4)
                rateMat[count, 2] = float(line[2]) # day
                rateMat[count, 3] = float(line[1]) # rating
        
        if nRatings != count+1:
            # User had a different number of items than expected; this is only
            # expected for the last user, any other mismatch will trigger assert errors
            if verbose:
                print "Warning: Expected", nRatings, "ratings from user; id:", int(userID), ", read", count+1, "ratings."
            rateMat = rateMat[:count+1,:]
        yield rateMat

def KddDataParser(infile, outfile, size, testFlag, verbose):
    ''' read data for each user and write to binary format'''
    
    # setup storage for max user, item, nratings
    readLen = 0
    readSize = zeros(3, dtype=dtype('i4'))
        
    # open reader and writer file handles
    if verbose: print "opening input file", infile
    readhandle = open(infile, 'rb')
    if verbose: print "opening output file", outfile
    writehandle = open(outfile, 'wb')
    
    # write the size information
    size.tofile(writehandle)
    
    # read for each user
    for count, rateMat in enumerate(readOneUser(readhandle, testFlag, verbose)):
        readSize = maximum(readSize, int_(amax(rateMat[:,:3], axis=0)) ) # max user, max item, max time
        readLen  += rateMat.shape[0]  
        
        rateMat[:,1]+=float(size[0]) # itemID = itemID+maxUser
        rateMat.tofile(writehandle)
        
        if verbose: 
            if count%50000 == 0: print 'read', rateMat.shape[0], 'ratings from user', int(rateMat[0,0])-1
    
    # close reader and writer file handles
    readhandle.close()
    writehandle.close()
    
    if verbose: print "data conversion completed"
    
    return readSize, readLen

def main():
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True)
    parser.add_option("-i", "--infile", dest="infile",
                      help="input file name", default="smallTrain.txt") # fixme
    parser.add_option("-o", "--outfile", dest="outfile",
                      help="output file name", default="kddcupLM")
    parser.add_option("-f", "--filetype", dest="filetype", type='int',
                      help="training=1, validation=2, test=3", default=1)
    parser.add_option("-u", "--nuser", dest="nuser",
                      help="max ID of users, if not set, defaults to expected KDD size")
    parser.add_option("-m", "--nitem", dest="nitem",
                      help="max ID of items, if not set, defaults to expected KDD size")
    parser.add_option("-t", "--ntime", dest="ntime",
                      help="max number of days, if not set, defaults to expected KDD size")
    parser.add_option("-r", "--nrate", dest="nrate",
                      help="number of ratings, if not set, defaults to expected KDD size")
    (options, args) = parser.parse_args()
    
    # setup nUser/nitem/nTime defaults based on train/valid/test
    nuser = 1000990 if options.nuser is None else options.nuser
    nitem = 624961 if options.nitem is None else options.nitem
    '''TODO: once pmf is modified, change definition of nitem
    nitem (train, valid)== 624961
    nitem(test)== 624959 
    Should not affect results'''

    if options.filetype==1:
        ntime = 6645 if options.ntime is None else options.ntime
        nrate = 252800275 if options.nrate is None else options.nrate
        istest = False
    elif options.filetype==2:
        ntime = 6645 if options.ntime is None else options.ntime
        nrate = 4003960 if options.nrate is None else options.nrate
        istest = False
    elif options.filetype==3:
        ntime = 6649 if options.ntime is None else options.ntime
        nrate = 6005940 if options.nrate is None else options.nrate
        istest = True
    else:
        errorStr = "--filetype input: " + str(options.filetype) + ". Allowed values are 1, 2, 3"
        raise LookupError(errorStr)
    
    size = array([nuser, nitem, ntime, nrate], dtype=dtype('i4'))
    
    [nUser, nItem, nDays], nRate = KddDataParser(options.infile, options.outfile, size, istest, options.verbose)
    
    print 'input nuser:', nuser, ', max ID of user read:', nUser
    print 'input nitem:', nitem, ', max ID of item read:', nItem
    print 'input ndays:', ntime, ', max day read', nDays
    print 'input nrate:', nrate, ', Number of ratings read:', nRate
    
    if (nuser!=nUser) or (nitem!=nItem) or (ntime!=nDays) or (nrate!=nRate):
        print "Warning: input parameters differ from output parameters,",
        print "graphlab pmf may not run correctly !!!"
    
if __name__ == '__main__':
    main()
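
As one more check of my own (not part of Sanmi's script), you can read back the 16-byte header that KddDataParser writes at the start of each output file and confirm it matches the nuser/nitem/ntime/nrate values printed by the script:

import numpy as np

# read the four int32 header values (nuser, nitem, ntime, nrate) from the converted training file
header = np.fromfile('kddcup', dtype=np.dtype('i4'), count=4)
print 'nuser, nitem, ntime, nrate =', header
# for the training file this should print [1000990, 624961, 6645, 252800275]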

Sanity check 0: The downloaded file sizes from Yahoo! should be:
-rw-r--r-- 1 bickson users 134407201 2011-01-24 07:46 testIdx1.txt
-rw-r--r-- 1 bickson users 5967164350 2011-01-24 10:23 trainIdx1.txt
-rw-r--r-- 1 bickson users 104193447 2011-01-24 10:25 validationIdx1.txt

Sanity check 1: The output file sizes should be:
$ ls -l kddcup*
-rw-r--r-- 1 sil sil 4044804416 2011-06-27 18:18 kddcup
-rw-r--r-- 1 sil sil 64063376 2011-06-28 12:51 kddcupe
-rw-r--r-- 1 sil sil 96095056 2011-06-28 16:22 kddcupt
(Thanks Yoyo!)
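
These sizes follow directly from the binary format the script writes: a 16-byte header (four int32 values) followed by one 16-byte row (four float32 values) per rating. A quick back-of-the-envelope check, added here for convenience:

# expected size = 16-byte header + nrate ratings * 4 float32 values * 4 bytes
for name, nrate in [('kddcup', 252800275), ('kddcupe', 4003960), ('kddcupt', 6005940)]:
    print name, 16 + nrate * 16
# prints 4044804416, 64063376 and 96095056, matching the listing above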

Sanity check 2: you can use the md5sum command to verify the created input files.
You should get the following checksums:
<34|0>bickson@bigbro6:~/newgraphlab/graphlabapi/debug/demoapps/pmf$ md5sum kddcupe
aa76bb1d0e6e897e270ed65d021ed1d8  kddcupe
<35|0>bickson@bigbro6:~/newgraphlab/graphlabapi/debug/demoapps/pmf$ md5sum kddcupt
917599ce7f715890a2705dc04851ac12  kddcupt
<36|0>bickson@bigbro6:~/newgraphlab/graphlabapi/debug/demoapps/pmf$ md5sum kddcup
345b168a208757b3098c6674b2fb653a  kddcup
If you got different output, please check carefully that the command line arguments used are as instructed.

Sanity check 3: When running the third conversion command (-f 3, the test file), you should see output like:
> data conversion completed
> input nuser: 1000990 , max ID of user read: 1000990
> input nitem: 624961 , max ID of item read: 624959
> input ndays: 6649 , max day read 6649
> input nrate: 6005940 , Number of ratings read: 6005940
> Warning: input parameters differ from output parameters, graphlab pmf may
> not run correctly !!!

For the test file, the item-count mismatch (624961 expected vs. 624959 read) and the resulting warning are the known issue mentioned in the script's docstring; if this is the only difference, it can be safely ignored.