#!/usr/bin/env python
#############################################
# Python wrapper for GaBP application in GraphLab
# By Daniel Zerbino, based on Matlab code by Danny Bickson
#############################################
import sys
import os
import tempfile
import struct
import numpy as np
import subprocess
#############################################
# Convenience binary writer/reader functions:
#############################################
def writeDouble(F, x):
    """Write x to the binary stream F as one little-endian double."""
    packed = struct.pack('<d', x)
    F.write(packed)
def writeVector(F, vector):
    """Write every element of vector to F as consecutive little-endian doubles.

    vector may be any iterable of numbers (list, numpy array, generator).
    All values are packed before anything is written, so an element that
    cannot be converted raises struct.error without leaving a partially
    written vector in the file.
    """
    values = list(vector)
    # One pack/write call for the whole vector instead of one per element.
    F.write(struct.pack('<%dd' % len(values), *values))
def writeInt(F, x):
    """Write x to the binary stream F as one little-endian 32-bit signed int."""
    packed = struct.pack('<i', x)
    F.write(packed)
def readInt(F):
    """Read one little-endian 32-bit signed int from the binary stream F.

    Raises struct.error if fewer bytes than required are available.
    """
    # Use the same format for sizing and unpacking: native 'i' and
    # standard '<i' are not guaranteed to have the same size.
    fmt = '<i'
    return struct.unpack(fmt, F.read(struct.calcsize(fmt)))[0]
def readDouble(F):
    """Read one little-endian double from the binary stream F.

    Raises struct.error if fewer bytes than required are available.
    """
    # Size the read with the exact format that is unpacked.
    fmt = '<d'
    return struct.unpack(fmt, F.read(struct.calcsize(fmt)))[0]
def readVector(F, n):
    """Read n consecutive little-endian doubles from F and return them as a list.

    Returns an empty list (without touching F) when n is not positive.
    Raises struct.error if the stream holds fewer than n doubles.
    """
    if n <= 0:
        return []
    # A single read/unpack for the whole vector rather than n separate ones.
    fmt = '<%dd' % n
    return list(struct.unpack(fmt, F.read(struct.calcsize(fmt))))
#############################################
# Convenience MatLab like functions
#############################################
def enumerate(M):
    """Yield (row, column, value) for every entry of the 2-D matrix M.

    Entries are visited row by row. Row and column numbers are 1-based
    (Matlab numbering), not Python's 0-based indices.

    NOTE: this deliberately keeps the original name, which shadows the
    builtin enumerate() for the rest of this module.
    """
    row_count = np.shape(M)[0]
    for row in range(row_count):
        col_count = np.shape(M)[1]
        for col in range(col_count):
            # Beware of Matlab numbering!!
            yield row + 1, col + 1, M[row, col]
def find(M):
    """Return the non-zero entries of the 2-D matrix M as a list of triples.

    Each triple is (row, column, value) with 1-based (Matlab) row and
    column numbers, listed in row-major order.

    A real list is returned (not a lazy filter object) so that callers can
    take len() of the result on both Python 2 and Python 3.
    """
    M = np.asarray(M)
    # np.nonzero reports indices in row-major order.
    rows, cols = np.nonzero(M)
    return [(int(i) + 1, int(j) + 1, M[i, j]) for i, j in zip(rows, cols)]
#############################################
# script for exporting system of linear equations of the type Ax = y to
# graphlab format.
# Written by Danny Bickson, CMU
# Input: fn - output file name
# A - an m x n matrix (if A is square then m = n)
# y - m x 1 observation vector
# x - n x 1 known solution (optional; if not given, a vector of zeros is written)
# sigma_y, sigma_x - noise-level vectors of length m and n (optional, must be
#   given together, for non-square matrices only)
# square - set to True when A is square (default False)
# Conversion to Python by Daniel Zerbino, UCSC
#############################################
def save_c_gl(fn, A, y, x=None, sigma_y=None, sigma_x=None, square=False):
    """Export the linear system Ax = y to a GraphLab GaBP binary file.

    Parameters:
        fn      - output file name
        A       - m x n matrix
        y       - observation vector of length m
        x       - known solution of length n (optional, zeros are written
                  when omitted)
        sigma_y - noise levels of length m (optional, non-square only)
        sigma_x - noise levels of length n (optional, non-square only);
                  sigma_y and sigma_x must be given together
        square  - True to treat A as square: edges are the off-diagonal
                  non-zeros and diag(A) is written in place of the sigmas

    File layout, in write order (all little-endian):
        int m, int n
        y (m doubles), x (n doubles)
        diag(A) if square, else sigma_y then sigma_x (ones when omitted)
        int number of edges, int 0 (padding for 64 bit offset)
        per edge: int row, int column (+ m when not square), double value
    Row and column numbers are 1-based.

    Any invalid input, an edge-less matrix, or a failed read-back of the
    header terminates through sys.exit() with a message.
    """
    m, n = np.shape(A)

    # ---- input validation ----
    if len(y) != m:
        sys.exit('y vector should be of len as the number of A rows (%i and %i resp.)' % (len(y), m))
    if x is not None and len(x) != n:
        sys.exit('x vector length should be as the matrix A columns')
    if sigma_y is None and sigma_x is not None:
        sys.exit("sigma_y should be provided when entering sigma_x")
    if sigma_x is None and sigma_y is not None:
        sys.exit("sigma_x should be provided when entering sigma_y")
    if sigma_x is not None:
        if square:
            sys.exit('sigma noise level input is allowed only for non-square matrices')
        if len(sigma_x) != n:
            sys.exit('sigma_x length should be number of cols of A')
        if len(sigma_y) != m:
            sys.exit('sigma_y length should be number of rows of A')
    if square and m != n:
        sys.exit('square=True requires A to have as many rows as columns (%i and %i resp.)' % (m, n))

    # ---- collect edges ----
    # list() so that len() works even if find() hands back a lazy iterable.
    if square:
        # matrix is square, edges are non diagonal entries
        vals = list(find(A - np.diag(np.diag(A))))
        print("Saving a square matrix A")
    else:
        # matrix is not square, edges are non zero values
        vals = list(find(A))
        print("Saving a non-square matrix A")
    # Checked before the file is opened so no partial file is left behind.
    if not vals:
        sys.exit('matrix A has no edges to write')

    # For non-square systems column numbers are shifted by m.
    offset = 0 if square else m

    with open(fn, 'wb') as F:
        # write matrix size
        writeInt(F, m)
        writeInt(F, n)
        # write y (the observation), x (the solution, if known), diag(A) the
        # variance (if known, else default variance of 1)
        writeVector(F, y)
        if x is not None:
            writeVector(F, x)
        else:
            writeVector(F, [0] * n)
        if square:
            writeVector(F, np.diag(A))
        elif sigma_y is not None:
            writeVector(F, sigma_y)
            writeVector(F, sigma_x)
        else:
            writeVector(F, [1] * (m + n))
        # write number of edges
        writeInt(F, len(vals))
        # pad with zeros for 64 bit offset
        writeInt(F, 0)
        # write all edges
        for row, col, value in vals:
            writeInt(F, row)
            writeInt(F, col + offset)
            writeDouble(F, value)

    # verify written file header
    with open(fn, 'rb') as F:
        written_rows = readInt(F)
    if written_rows != m:
        sys.exit('header verification failed for %s: read %i rows, expected %i' % (fn, written_rows, m))
    print('Wrote succesfully into file: %s' % fn)
########################################################
# Script for reading the output of the GaBP GraphLab program into Python
# (the output file is deleted once it has been read)
# returns x = inv(A)*b as computed by GaBP
# returns diag = diag(inv(A)) - an approximation to the main diagonal of
# the inverse matrix of A.
# Written by Danny Bickson, CMU
# Conversion to Python by Daniel Zerbino, UCSC
########################################################
def load_c_gl(filename, columns):
    """Read a GaBP output file and delete it.

    The file is expected to hold two consecutive vectors of `columns`
    doubles each: the solution x followed by diag (described in the header
    comment above as an approximation to the main diagonal of inv(A)).

    Returns the tuple (x, diag), each a list of `columns` floats.
    The file is removed only after both vectors were read successfully;
    the handle is closed even when reading fails.
    """
    with open(filename, 'rb') as F:
        x = readVector(F, columns)
        diag = readVector(F, columns)
    os.remove(filename)
    return x, diag
########################################################
## Wrapper Utility to be used from outside
########################################################
def runGaBP(convergence, A, y, sigma_y=None, x=None, sigma_x=None, square=False):
    """Solve Ax = y by running the external 'gabp' program.

    The system is written with save_c_gl() to a temporary file in the
    current directory, 'gabp' is run on it, and the result is read back
    with load_c_gl() from '<temporary file>.out'.

    Parameters:
        convergence - value passed to gabp as --threshold
        A, y, sigma_y, x, sigma_x, square - forwarded to save_c_gl()

    Returns the tuple (x2, diag) produced by load_c_gl(), each of length
    equal to the number of columns of A.

    Exits through sys.exit() when gabp returns a non-zero status. The
    temporary input file is removed in every case.
    """
    fd, data_path = tempfile.mkstemp(dir='.')
    # mkstemp returns an open OS-level descriptor; only the path is needed.
    os.close(fd)
    try:
        save_c_gl(data_path, A, y, x=x, sigma_y=sigma_y, sigma_x=sigma_x, square=square)
        args = ['gabp', '--data', data_path, '--threshold', str(convergence),
                '--algorithm', '0', '--scheduler=round_robin', '--square',
                'true' if square else 'false']
        print("Running " + " ".join(args))
        # Flush so our own output is not interleaved after the child's.
        sys.stdout.flush()
        ret = subprocess.call(args, stdout=sys.stdout, stderr=subprocess.STDOUT)
        if ret != 0:
            sys.exit("GaBP did not complete")
    finally:
        os.remove(data_path)
    # Use A's column count rather than len(x): x is optional and may be None.
    columns = np.shape(A)[1]
    return load_c_gl(data_path + ".out", columns)
#########################################################
## Unit test
#########################################################
def main():
    """Run GaBP on a small fixed 3x2 system and print inputs, result and errors."""
    A = np.array([[0.2785, 0.9649],
                  [0.5469, 0.1576],
                  [0.9575, 0.9706]])
    y = np.array([1.2434, 0.7045, 1.9281])
    sigma_y = np.array([1e-10, 1e-10, 1e-10])
    x = np.array([0, 0])
    sigma_x = np.array([1, 1])
    convergence = 1e-10

    x2, diag = runGaBP(convergence, A, y, sigma_y=sigma_y, x=x, sigma_x=sigma_x)

    print('A')
    print(A)
    print('y')
    print(y)
    print('Initial X')
    print(x)
    print('Initial Error')
    print(A.dot(x) - y)
    print('Final X')
    print(x2)
    print('Final Error')
    print(A.dot(x2) - y)
    print('diag')
    print(diag)


if __name__ == '__main__':
    main()
Showing posts with label Python. Show all posts
Showing posts with label Python. Show all posts
Saturday, June 4, 2011
Python wrapper for running GraphLab GaBP linear solver
Daniel Zerbino, a postdoc at UCSC who is working on smoothing genomic sequencing measurements with constraints, sent me a Python script for converting data to GraphLab GaBP format. Here it is:
Saturday, April 16, 2011
Yahoo! KDD CUP using GraphLab - Part 2
Preparing the input
Sanmi Koyejo, a graduate student at the University of Texas at Austin, sent me a Python script for converting the KDD Yahoo! Cup dataset into GraphLab format. Thanks so much!
It may be preferable to the Matlab script, since for some users the Matlab script seems to run out of memory.
I have attached the python script for reading the kdd dataset. The package requires numpy so it can use the file writing method.
Additional information about the conversion procedure is kindly supplied by Yoyo here.
NOTE: For running the resulting files in Graphlab, you will need to have access to a 64 bit machine. (32 bit machine can not load this dataset in its current form).
Sanity check 0: The downloaded file size from Yahoo! should be:
-rw-r--r-- 1 bickson users 134407201 2011-01-24 07:46 testIdx1.txt
-rw-r--r-- 1 bickson users 5967164350 2011-01-24 10:23 trainIdx1.txt
-rw-r--r-- 1 bickson users 104193447 2011-01-24 10:25 validationIdx1.txt
Sanity check 1: The output file size should be:
$ ls -l kddcup*
-rw-r--r-- 1 sil sil 4044804416 2011-06-27 18:18 kddcup
-rw-r--r-- 1 sil sil 64063376 2011-06-28 12:51 kddcupe
-rw-r--r-- 1 sil sil 96095056 2011-06-28 16:22 kddcupt
(Thanks Yoyo!)
Sanity check 2: you can use the md5sum command to verify creation of inputs.
You should get the following numbers:
Sanity check 3: When running the third script, you should see the output:
> data conversion completed
> input nuser: 1000990 , max ID of user read: 1000990
> input nitem: 624961 , max ID of item read: 624959
> input ndays: 6649 , max day read 6649
> input nrate: 6005940 , Number of ratings read: 6005940
> Warning: input parameters differ from output parameters, graphlab pmf may
> not run correctly !!!
Sanmi Koyejo, a graduate student at the University of Texas at Austin, sent me a Python script for converting the KDD Yahoo! Cup dataset into GraphLab format. Thanks so much!
It may be preferable to the Matlab script, since for some users the Matlab script seems to run out of memory.
I have attached the python script for reading the kdd dataset. The package requires numpy so it can use the file writing method.
Additional information about the conversion procedure is kindly supplied by Yoyo here.
NOTE: For running the resulting files in Graphlab, you will need to have access to a 64 bit machine. (32 bit machine can not load this dataset in its current form).
''' Created on Apr 16, 2011

Read KDD cup data (Low Memory) to store in format suitable for graphlab pmf
Requires numpy, uses ndarray.tofile() to write the binary file

Module uses (A LOT) less memory than readKddData.py by reading and writing
one user at a time.
The tradeoff is that the max user id, item id, days and number of ratings
must be known beforehand.
This is because pmf expects this input in the first line of the File

Known Issue: Number of test (-f3) items is 624959, although we hard-code
624961. This restriction comes from a bug(?) in pmf (to be fixed soon)
Ignore this warning if this your only issue

usage:
    python readKddLM.py --help
    python readKddLM.py -i trainIdx.txt -o kddcup -f 1
    python readKddLM.py -i validationIdx.txt -o kddcupe -f 2
    python readKddLM.py -i testIdx.txt -o kddcupt -f 3

@author: Sanmi Koyejo; [email protected]
'''
from optparse import OptionParser
from numpy import array, dtype, amax, maximum, zeros, int_


def readLine(fileHandle, splitter='|'):
    '''Read a single line and split it on splitter.

    Returns the list of fields with the trailing newline removed. At end
    of file the empty value returned by readline() is passed back
    unchanged so the caller can detect EOF.
    '''
    line = fileHandle.readline()
    if not line:  # EOF
        return line  # return null and let caller handle it
    return line.rstrip().split(splitter)


def readChunk(fileHandle, chunkSize, splitter='\t'):
    '''Yield up to chunkSize lines, each split on splitter.

    Stops early, without error, if the end of file is reached.
    '''
    for _ in range(chunkSize):
        line = fileHandle.readline()
        if not line:  # EOF
            break
        yield line.rstrip().split(splitter)


def readOneUser(fileHandle, testFlag=False, verbose=True):
    '''Yield one float32 rating matrix per user found in fileHandle.

    Each user starts with a "userID|nRatings" header line followed by
    nRatings tab-separated rating lines. Rating lines have 3 fields when
    testFlag is set (item, day, ...) and 4 fields otherwise
    (item, rating, day, ...); any other field count fails an assert.

    The yielded matrix has one row per rating actually read and the
    columns [userID + 1, itemID + 1, day, rating]. With testFlag set the
    rating column is filled with 1.0.
    '''
    while True:
        header = readLine(fileHandle)
        if not header:
            break  # EOF
        assert len(header) == 2
        userID = float(header[0])
        nRatings = int(header[1])
        rateMat = zeros((nRatings, 4), dtype=dtype('f4'))
        rateMat[:, 0] = userID + 1  # user ID

        # Reset per user: without this, a user with no rating lines would
        # reuse the previous user's count (or hit an unbound name).
        count = -1
        for count, fields in enumerate(readChunk(fileHandle, nRatings)):
            # note allow last user to break nratings constraint.
            # All other users should satisfy this
            rateMat[count, 1] = float(fields[0]) + 1  # item ID
            if testFlag:
                assert len(fields) == 3  # error checking
                rateMat[count, 2] = float(fields[1])  # day
                rateMat[count, 3] = 1.0  # rating
            else:
                assert len(fields) == 4
                rateMat[count, 2] = float(fields[2])  # day
                rateMat[count, 3] = float(fields[1])  # rating

        if verbose and nRatings != count + 1:
            # User had a different number of items than expected.
            # Will only work for last user, any difference for other users
            # will trigger assert errors
            print("Warning: Expected %s ratings from user; id: %s , read %s ratings."
                  % (nRatings, int(userID), count + 1))
        yield rateMat[:count + 1, :]


def KddDataParser(infile, outfile, size, testFlag, verbose):
    '''Read the ratings of each user from infile and write them to outfile.

    size (an int32 array: nuser, nitem, ntime, nrate) is written first,
    followed by each user's float32 rating matrix. Before writing, the
    item column is shifted by size[0], so that itemID = itemID + maxUser.

    Returns (readSize, readLen): readSize holds the maximum user, item
    and day values seen (before the item shift) and readLen is the total
    number of ratings read.
    '''
    # setup storage for max user, item, nratings
    readLen = 0
    readSize = zeros(3, dtype=dtype('i4'))

    # open reader and writer file handles
    if verbose:
        print("opening input file %s" % infile)
    with open(infile, 'r') as readhandle:
        if verbose:
            print("opening output file %s" % outfile)
        with open(outfile, 'wb') as writehandle:
            # write the size information
            size.tofile(writehandle)

            # read for each user
            for count, rateMat in enumerate(readOneUser(readhandle, testFlag, verbose)):
                if rateMat.shape[0] == 0:
                    # Nothing to write, and amax() cannot reduce an empty array.
                    continue
                # max user, max item, max time
                readSize = maximum(readSize, int_(amax(rateMat[:, :3], axis=0)))
                readLen += rateMat.shape[0]

                rateMat[:, 1] += float(size[0])  # itemID = itemID+maxUser
                rateMat.tofile(writehandle)

                if verbose and count % 50000 == 0:
                    print('read %s ratings from user %s'
                          % (rateMat.shape[0], int(rateMat[0, 0]) - 1))

    if verbose:
        print("data conversion completed")
    return readSize, readLen


def main():
    '''Parse the command line, convert the input file and report the sizes read.'''
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True)
    parser.add_option("-i", "--infile", dest="infile",
                      help="input file name", default="smallTrain.txt")  # fixme
    parser.add_option("-o", "--outfile", dest="outfile",
                      help="output file name", default="kddcupLM")
    parser.add_option("-f", "--filetype", dest="filetype", type='int',
                      help="training=1, validation=2, test=3", default=1)
    # type='int' so that user-supplied sizes compare equal to the integer
    # values read back from the data instead of being left as strings.
    parser.add_option("-u", "--nuser", dest="nuser", type='int',
                      help="max ID of users, if not set, defaults to expected KDD size")
    parser.add_option("-m", "--nitem", dest="nitem", type='int',
                      help="max ID of items, if not set, defaults to expected KDD size")
    parser.add_option("-t", "--ntime", dest="ntime", type='int',
                      help="max number of days, if not set, defaults to expected KDD size")
    parser.add_option("-r", "--nrate", dest="nrate", type='int',
                      help="number of ratings, if not set, defaults to expected KDD size")
    (options, args) = parser.parse_args()

    # setup nUser/nitem/nTime defaults based on train/valid/test
    nuser = 1000990 if options.nuser is None else options.nuser
    nitem = 624961 if options.nitem is None else options.nitem
    # TODO: once pmf is modified, change definition of nitem
    #   nitem (train, valid) == 624961
    #   nitem (test) == 624959
    # Should not affect results

    # filetype -> (default ntime, default nrate, is test file)
    defaults = {
        1: (6645, 252800275, False),
        2: (6645, 4003960, False),
        3: (6649, 6005940, True),
    }
    if options.filetype not in defaults:
        errorStr = "--filetype input: " + repr(options.filetype) + ". Allowed values are 1, 2, 3"
        raise LookupError(errorStr)
    defaultTime, defaultRate, istest = defaults[options.filetype]
    ntime = defaultTime if options.ntime is None else options.ntime
    nrate = defaultRate if options.nrate is None else options.nrate

    size = array([nuser, nitem, ntime, nrate], dtype=dtype('i4'))
    [nUser, nItem, nDays], nRate = KddDataParser(options.infile, options.outfile,
                                                 size, istest, options.verbose)

    print('input nuser: %s , max ID of user read: %s' % (nuser, nUser))
    print('input nitem: %s , max ID of item read: %s' % (nitem, nItem))
    print('input ndays: %s , max day read %s' % (ntime, nDays))
    print('input nrate: %s , Number of ratings read: %s' % (nrate, nRate))
    if (nuser != nUser) or (nitem != nItem) or (ntime != nDays) or (nrate != nRate):
        print("Warning: input parameters differ from output parameters, "
              "graphlab pmf may not run correctly !!!")


if __name__ == '__main__':
    main()
Sanity check 0: The downloaded file size from Yahoo! should be:
-rw-r--r-- 1 bickson users 134407201 2011-01-24 07:46 testIdx1.txt
-rw-r--r-- 1 bickson users 5967164350 2011-01-24 10:23 trainIdx1.txt
-rw-r--r-- 1 bickson users 104193447 2011-01-24 10:25 validationIdx1.txt
Sanity check 1: The output file size should be:
$ ls -l kddcup*
-rw-r--r-- 1 sil sil 4044804416 2011-06-27 18:18 kddcup
-rw-r--r-- 1 sil sil 64063376 2011-06-28 12:51 kddcupe
-rw-r--r-- 1 sil sil 96095056 2011-06-28 16:22 kddcupt
(Thanks Yoyo!)
Sanity check 2: you can use the md5sum command to verify creation of inputs.
You should get the following numbers:
<34|0>bickson@bigbro6:~/newgraphlab/graphlabapi/debug/demoapps/pmf$ md5sum kddcupe
aa76bb1d0e6e897e270ed65d021ed1d8 kddcupe
<35|0>bickson@bigbro6:~/newgraphlab/graphlabapi/debug/demoapps/pmf$ md5sum kddcupt
917599ce7f715890a2705dc04851ac12 kddcupt
<36|0>bickson@bigbro6:~/newgraphlab/graphlabapi/debug/demoapps/pmf$ md5sum kddcup
345b168a208757b3098c6674b2fb653a kddcup
If you got different output, please check carefully that the command line arguments used are as instructed.
Sanity check 3: When running the third script, you should see the output:
> data conversion completed
> input nuser: 1000990 , max ID of user read: 1000990
> input nitem: 624961 , max ID of item read: 624959
> input ndays: 6649 , max day read 6649
> input nrate: 6005940 , Number of ratings read: 6005940
> Warning: input parameters differ from output parameters, graphlab pmf may
> not run correctly !!!
Subscribe to:
Comments (Atom)