代码片段

shell:

1. 处理 LocationProbe.csv 的 Linux shell 脚本（提取第 3、20、23 列，删除首行，按数值排序并去重）：

cat LocationProbe.csv | cut -d ',' -f 3,20,23|sed '1d'|sort  -u -n>location_dealed.csv

2，将时间戳转换为可读时间的方法：

其中 @ 后面跟随的是 Unix 时间戳（自 1970-01-01 00:00:00 UTC 起的秒数）：

date -d "@1385991753" "+%Y/%m/%d %H:%M:%S"

shell:

sort

uniq

cut

grep

sed

awk

python:

 import sys
import codecs
import pickle
#from operator import itemgetter
#from random import shuffle
from exceptions import ValueError
from numpy.random import shuffle

from recsys.algorithm import VERBOSE

# Python 2/3 compatibility: "any string" and "text string" type names.
try:
    _STRING_TYPES = (basestring,)
    _TEXT_TYPE = unicode
except NameError:  # Python 3: basestring/unicode no longer exist
    _STRING_TYPES = (str, bytes)
    _TEXT_TYPE = str


def _to_text(field):
    """Return *field* as a text string; byte strings are decoded as UTF-8."""
    if isinstance(field, _TEXT_TYPE):
        return field
    if isinstance(field, bytes):
        return field.decode('utf8')
    return _TEXT_TYPE(field)


def _int_if_possible(field):
    """Return ``int(field)`` when that conversion works, else *field* unchanged."""
    try:
        return int(field)
    except (TypeError, ValueError):
        return field


class Data:
    """
    Handles the relationships among users and items.

    The dataset is kept as a plain list of ``(value, row_id, col_id)`` tuples
    in ``self._data``.
    """
    def __init__(self):
        self._data = []

    def __repr__(self):
        rows = self.get()
        s = '%d rows.' % len(rows)
        if len(rows):
            s += '\nE.g: %s' % str(rows[0])
        return s

    def __len__(self):
        return len(self.get())

    def __getitem__(self, i):
        """
        :returns: the i-th tuple, or None (no IndexError is raised) when
            ``i`` is not smaller than the number of stored tuples
        """
        if i < len(self._data):
            return self._data[i]
        return None

    def __iter__(self):
        return iter(self.get())

    def set(self, data, extend=False):
        """
        Sets data to the dataset

        :param data: a list of tuples
        :type data: list
        :param extend: append *data* to the current content instead of replacing it
        :type extend: Boolean
        """
        if extend:
            self._data.extend(data)
        else:
            # The list is stored by reference, not copied.
            self._data = data

    def get(self):
        """
        :returns: a list of tuples
        """
        return self._data

    def add_tuple(self, tuple):
        """
        Validates and appends one tuple to the dataset

        :param tuple: a tuple containing <rating, user, item> information (e.g.  <value, row, col>)
        :raises ValueError: if the tuple does not have exactly 3 fields, the value is
            empty or is a string, or the row/col id is None or an empty string
        """
        #E.g: tuple = (25, "ocelma", "u2") -> "ocelma has played u2 25 times"
        if not len(tuple) == 3:
            raise ValueError('Tuple format not correct (should be: <value, row_id, col_id>)')
        value, row_id, col_id = tuple
        # 0 is a legitimate value, every other falsy value (None, '', ...) is not
        if not value and value != 0:
            raise ValueError('Value is empty %s' % (tuple,))
        if isinstance(value, _STRING_TYPES):
            raise ValueError('Value %s is a string (must be an int or float) %s' % (value, tuple,))
        if row_id is None or row_id == '':
            raise ValueError('Row id is empty %s' % (tuple,))
        if col_id is None or col_id == '':
            raise ValueError('Col id is empty %s' % (tuple,))
        self._data.append(tuple)

    def split_train_test(self, percent=80, shuffle_data=True):
        """
        Splits the data in two disjunct datasets: train and test

        :param percent: % of training set to be used (test set size = 100-percent)
        :type percent: int
        :param shuffle_data: shuffle dataset? (the stored list is shuffled in place)
        :type shuffle_data: Boolean

        :returns: a tuple <Data, Data>
        """
        if shuffle_data:
            shuffle(self._data)
        length = len(self._data)
        # A single cut point keeps both parts disjoint and covering the whole
        # dataset. Slicing the test part from the end with a negative size
        # would return everything for percent=100, since [-0:] is the full list.
        split_at = int(round(length * percent / 100.0))
        split_at = max(0, min(length, split_at))
        train = Data()
        train.set(self._data[:split_at])
        test = Data()
        test.set(self._data[split_at:])

        return train, test

    def load(self, path, force=True, sep='\t', format=None, pickle=False):
        """
        Loads data from a file

        Lines that cannot be parsed according to *format* are reported on
        stdout and skipped. Ids are converted to int whenever possible.

        :param path: filename
        :type path: string
        :param force: Cleans already added data
        :type force: Boolean
        :param sep: Separator among the fields of the file content
        :type sep: string
        :param format: Format of the file content.
            Default format is 'value': 0 (first field), then 'row': 1, and 'col': 2.
            E.g: format={'row':0, 'col':1, 'value':2}. The row is in position 0,
            then there is the column value, and finally the rating.
            So, it resembles to a matrix in plain format.
            If 'value' is missing the value defaults to 1. With 'ids': int (or 'int'),
            lines whose ids are not integers are skipped.
        :type format: dict()
        :param pickle: is input file in  pickle format?
        :type pickle: Boolean
        :raises ValueError: when no format is given and a line does not contain
            exactly 2 (row, col) or 3 (value, row, col) fields
        """
        if VERBOSE:
            sys.stdout.write('Loading %s\n' % path)
        if force:
            self._data = []
        if pickle:
            self._load_pickle(path)
            return

        ids_must_be_int = bool(format) and format.get('ids') in (int, 'int')
        count = 0
        with codecs.open(path, 'r', 'utf8') as lines:
            for line in lines:
                data = line.strip('\r\n').split(sep)
                if not format:
                    if len(data) == 3:
                        value, row_id, col_id = data
                    elif len(data) == 2:
                        # Default value is 1
                        value = 1
                        row_id, col_id = data
                    else:
                        raise ValueError('Expected 2 or 3 fields but found %d while reading: %s'
                                         % (len(data), data))
                else:
                    try:
                        # Default value is 1
                        value = data[format['value']] if 'value' in format else 1
                        row_id = data[format.get('row', 1)]
                        col_id = data[format.get('col', 2)]
                    except IndexError:
                        sys.stdout.write('Error while reading: %s\n' % data) #Just ignore that line
                        continue
                    row_id = row_id.strip()
                    col_id = col_id.strip()
                    if ids_must_be_int:
                        try:
                            row_id = int(row_id)
                            col_id = int(col_id)
                        except ValueError:
                            sys.stdout.write('Error (ID is not int) while reading: %s\n' % data) #Just ignore that line
                            continue
                # Try to convert ids to int
                row_id = _int_if_possible(row_id)
                col_id = _int_if_possible(col_id)
                # Add tuple
                try:
                    self.add_tuple((float(value), row_id, col_id))
                except (TypeError, ValueError):
                    if VERBOSE:
                        sys.stdout.write('\nError while reading (%s, %s, %s). Skipping this tuple\n' % (value, row_id, col_id))
                # Lines whose tuple was rejected are still counted for the progress output
                count += 1
                if VERBOSE:
                    if count % 100000 == 0:
                        sys.stdout.write('.')
                    if count % 1000000 == 0:
                        sys.stdout.write('|')
                    if count % 10000000 == 0:
                        sys.stdout.write(' (%d M)\n' % (count // 1000000))
        if VERBOSE:
            sys.stdout.write('\n')

    def _load_pickle(self, path):
        """
        Loads data from a pickle file

        :param path: input filename
        :type path: string
        """
        # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
        with open(path, 'rb') as pickled:
            self._data = pickle.load(pickled)

    def save(self, path, pickle=False):
        """
        Saves data in output file

        In plain format one line per tuple is written, as tab separated
        value, row id and col id, encoded in UTF-8.

        :param path: output filename
        :type path: string
        :param pickle: save in pickle format?
        :type pickle: Boolean
        """
        if VERBOSE:
            sys.stdout.write('Saving data to %s\n' % path)
        if pickle:
            self._save_pickle(path)
            return
        with codecs.open(path, 'w', 'utf8') as out:
            for value, row_id, col_id in self._data:
                fields = [_to_text(value), _to_text(row_id), _to_text(col_id)]
                out.write('\t'.join(fields) + '\n')

    def _save_pickle(self, path):
        """
        Saves data in output file, using pickle format

        :param path: output filename
        :type path: string
        """
        # Binary mode: pickle streams are bytes
        with open(path, 'wb') as out:
            pickle.dump(self._data, out)

datamodel

posted @ 2013-12-02 21:41 bobo的学习笔记阅读(325) 评论(0) 收藏举报

刷新页面返回顶部

代码片段

公告