MIT 6.5830 simpleDB Lab2

Exercise 1

需要完成的是:

  • src/java/simpledb/execution/Predicate.java
  • src/java/simpledb/execution/JoinPredicate.java
  • src/java/simpledb/execution/Filter.java
  • src/java/simpledb/execution/Join.java

这里主要实现两个功能:

  1. Filter:通过Predicate过滤一部分满足条件的Tuple
  2. Join:通过JoinPredicate将满足条件的两个子集的元组连接起来

Predicate.java代码:

package simpledb.execution;

import simpledb.storage.Field;
import simpledb.storage.Tuple;

import java.io.Serializable;

/**
 * Predicate compares tuples to a specified Field value.
 */
public class Predicate implements Serializable {

    private static final long serialVersionUID = 1L;

    private int field;
    private Op op;
    private Field operand;

    /**
     * Constants used for return codes in Field.compare
     */
    public enum Op implements Serializable {
        EQUALS, GREATER_THAN, LESS_THAN, LESS_THAN_OR_EQ, GREATER_THAN_OR_EQ, LIKE, NOT_EQUALS;

        /**
         * Interface to access operations by integer value for command-line
         * convenience.
         *
         * @param i a valid integer Op index
         */
        public static Op getOp(int i) {
            return values()[i];
        }

        public String toString() {
            if (this == EQUALS)
                return "=";
            if (this == GREATER_THAN)
                return ">";
            if (this == LESS_THAN)
                return "<";
            if (this == LESS_THAN_OR_EQ)
                return "<=";
            if (this == GREATER_THAN_OR_EQ)
                return ">=";
            if (this == LIKE)
                return "LIKE";
            if (this == NOT_EQUALS)
                return "<>";
            throw new IllegalStateException("impossible to reach here");
        }

    }

    /**
     * Constructor.
     *
     * @param field   field number of passed in tuples to compare against.
     * @param op      operation to use for comparison
     * @param operand field value to compare passed in tuples to
     */
    public Predicate(int field, Op op, Field operand) {
        // TODO: some code goes here
        this.field = field;
        this.op = op;
        this.operand = operand;
    }

    /**
     * @return the field number
     */
    public int getField() {
        // TODO: some code goes here
        return this.field;
    }

    /**
     * @return the operator
     */
    public Op getOp() {
        // TODO: some code goes here
        return this.op;
    }

    /**
     * @return the operand
     */
    public Field getOperand() {
        // TODO: some code goes here
        return this.operand;
    }

    /**
     * Compares the field number of t specified in the constructor to the
     * operand field specified in the constructor using the operator specific in
     * the constructor. The comparison can be made through Field's compare
     * method.
     *
     * @param t The tuple to compare against
     * @return true if the comparison is true, false otherwise.
     */
    public boolean filter(Tuple t) {
        // TODO: some code goes here
        Field field = t.getField(this.field);
        return field.compare(this.op, this.operand);
    }

    /**
     * Returns something useful, like "f = field_id op = op_string operand =
     * operand_string"
     */
    public String toString() {
        // TODO: some code goes here
        return String.format("f = %d op = %s operand = %s", this.field, this.op.toString(), this.operand.toString());
    }
}

JoinPredicate.java代码:

package simpledb.execution;

import simpledb.storage.Field;
import simpledb.storage.Tuple;

import java.io.Serializable;

/**
 * JoinPredicate compares fields of two tuples using a predicate. JoinPredicate
 * is most likely used by the Join operator.
 */
public class JoinPredicate implements Serializable {

    private static final long serialVersionUID = 1L;
    private int field1;
    private int field2;
    private Predicate.Op op;

    /**
     * Constructor -- create a new predicate over two fields of two tuples.
     *
     * @param field1 The field index into the first tuple in the predicate
     * @param field2 The field index into the second tuple in the predicate
     * @param op     The operation to apply (as defined in Predicate.Op); either
     *               Predicate.Op.GREATER_THAN, Predicate.Op.LESS_THAN,
     *               Predicate.Op.EQUAL, Predicate.Op.GREATER_THAN_OR_EQ, or
     *               Predicate.Op.LESS_THAN_OR_EQ
     * @see Predicate
     */
    public JoinPredicate(int field1, Predicate.Op op, int field2) {
        // TODO: some code goes here
        this.field1 = field1;
        this.field2 = field2;
        this.op = op;
    }

    /**
     * Apply the predicate to the two specified tuples. The comparison can be
     * made through Field's compare method.
     *
     * @return true if the tuples satisfy the predicate.
     */
    public boolean filter(Tuple t1, Tuple t2) {
        // TODO: some code goes here
        if (t1 == null || t2 == null)
            return false;
        Field f1 = t1.getField(field1);
        Field f2 = t2.getField(field2);
        return f1.compare(op, f2);
    }

    public int getField1() {
        // TODO: some code goes here
        return this.field1;
    }

    public int getField2() {
        // TODO: some code goes here
        return this.field2;
    }

    public Predicate.Op getOperator() {
        // TODO: some code goes here
        return this.op;
    }
}

Filter.java代码:

package simpledb.execution;

import simpledb.common.DbException;
import simpledb.storage.Tuple;
import simpledb.storage.TupleDesc;
import simpledb.transaction.TransactionAbortedException;

import java.util.NoSuchElementException;

/**
 * Filter is an operator that implements a relational select.
 */
public class Filter extends Operator {

    private static final long serialVersionUID = 1L;
    private Predicate p;
    private TupleDesc td;
    private OpIterator[] child;

    /**
     * Constructor accepts a predicate to apply and a child operator to read
     * tuples to filter from.
     *
     * @param p     The predicate to filter tuples with
     * @param child The child operator
     */
    public Filter(Predicate p, OpIterator child) {
        // TODO: some code goes here
        this.p = p;
        this.child = new OpIterator[1];
        this.child[0] = child;
        this.td = child.getTupleDesc();
    }

    public Predicate getPredicate() {
        // TODO: some code goes here
        return this.p;
    }

    public TupleDesc getTupleDesc() {
        // TODO: some code goes here
        return this.td;
    }

    public void open() throws DbException, NoSuchElementException,
            TransactionAbortedException {
        // TODO: some code goes here
        super.open();
        this.child[0].open();
    }

    public void close() {
        // TODO: some code goes here
        super.close();
        this.child[0].close();
    }

    public void rewind() throws DbException, TransactionAbortedException {
        // TODO: some code goes here
        this.child[0].rewind();
    }

    /**
     * AbstractDbIterator.readNext implementation. Iterates over tuples from the
     * child operator, applying the predicate to them and returning those that
     * pass the predicate (i.e. for which the Predicate.filter() returns true.)
     *
     * @return The next tuple that passes the filter, or null if there are no
     *         more tuples
     * @see Predicate#filter
     */
    protected Tuple fetchNext() throws NoSuchElementException,
            TransactionAbortedException, DbException {
        // TODO: some code goes here
        while (this.child[0].hasNext()) {
            Tuple t = this.child[0].next();
            if (this.p.filter(t)) {
                return t;
            }
        }
        return null;
    }

    @Override
    public OpIterator[] getChildren() {
        // TODO: some code goes here
        return this.child;
    }

    @Override
    public void setChildren(OpIterator[] children) {
        // TODO: some code goes here
        this.child = children;
    }

}

Join代码:

package simpledb.execution;

import simpledb.common.DbException;
import simpledb.storage.Tuple;
import simpledb.storage.TupleDesc;
import simpledb.transaction.TransactionAbortedException;

import java.util.NoSuchElementException;

/**
 * The Join operator implements the relational join operation.
 */
public class Join extends Operator {

    private static final long serialVersionUID = 1L;
    private JoinPredicate p;
    private OpIterator[] child;
    private TupleDesc td;
    private Tuple curJoinTuple;

    /**
     * Constructor. Accepts two children to join and the predicate to join them
     * on
     *
     * @param p      The predicate to use to join the children
     * @param child1 Iterator for the left(outer) relation to join
     * @param child2 Iterator for the right(inner) relation to join
     */
    public Join(JoinPredicate p, OpIterator child1, OpIterator child2) {
        // TODO: some code goes here
        this.p = p;
        this.child = new OpIterator[2];
        this.child[0] = child1;
        this.child[1] = child2;
        this.td = TupleDesc.merge(child1.getTupleDesc(), child2.getTupleDesc());
    }

    public JoinPredicate getJoinPredicate() {
        // TODO: some code goes here
        return this.p;
    }

    /**
     * @return the field name of join field1. Should be quantified by
     *         alias or table name.
     */
    public String getJoinField1Name() {
        // TODO: some code goes here
        return this.child[0].getTupleDesc().getFieldName(this.p.getField1());
    }

    /**
     * @return the field name of join field2. Should be quantified by
     *         alias or table name.
     */
    public String getJoinField2Name() {
        // TODO: some code goes here
        return this.child[1].getTupleDesc().getFieldName(this.p.getField2());
    }

    /**
     * @see TupleDesc#merge(TupleDesc, TupleDesc) for possible
     *         implementation logic.
     */
    public TupleDesc getTupleDesc() {
        // TODO: some code goes here
        return this.td;
    }

    public void open() throws DbException, NoSuchElementException,
            TransactionAbortedException {
        // TODO: some code goes here
        super.open();
        this.child[0].open();
        this.child[1].open();
    }

    public void close() {
        // TODO: some code goes here
        super.close();
        this.child[0].close();
        this.child[1].close();
    }

    public void rewind() throws DbException, TransactionAbortedException {
        // TODO: some code goes here
        this.child[0].rewind();
        this.child[1].rewind();
    }

    /**
     * Returns the next tuple generated by the join, or null if there are no
     * more tuples. Logically, this is the next tuple in r1 cross r2 that
     * satisfies the join predicate. There are many possible implementations;
     * the simplest is a nested loops join.
     * <p>
     * Note that the tuples returned from this particular implementation of Join
     * are simply the concatenation of joining tuples from the left and right
     * relation. Therefore, if an equality predicate is used there will be two
     * copies of the join attribute in the results. (Removing such duplicate
     * columns can be done with an additional projection operator if needed.)
     * <p>
     * For example, if one tuple is {1,2,3} and the other tuple is {1,5,6},
     * joined on equality of the first column, then this returns {1,2,3,1,5,6}.
     *
     * @return The next matching tuple.
     * @see JoinPredicate#filter
     */
    protected Tuple fetchNext() throws TransactionAbortedException, DbException {
        // TODO: some code goes here
        while (child[0].hasNext() || curJoinTuple != null) {
            if (child[0].hasNext() && curJoinTuple == null) {
                curJoinTuple = child[0].next();
            }
            Tuple right;
            while (child[1].hasNext()) {
                right = child[1].next();
                if (p.filter(curJoinTuple, right)) {
                    int len1 = curJoinTuple.getTupleDesc().numFields();
                    int len2 = right.getTupleDesc().numFields();
                    Tuple tuple = new Tuple(td);
                    for (int i = 0; i < len1; i++) {
                        tuple.setField(i, curJoinTuple.getField(i));
                    }
                    for (int i = 0; i < len2; i++) {
                        tuple.setField(i + len1, right.getField(i));
                    }
                    return tuple;
                }
            }
            curJoinTuple = null;
            child[1].rewind();
        }
        return null;
    }

    @Override
    public OpIterator[] getChildren() {
        // TODO: some code goes here
        return this.child;
    }

    @Override
    public void setChildren(OpIterator[] children) {
        // TODO: some code goes here
        this.child = children;
    }

}

fetchNext方法主要返回下一个由Join生成的元组,这里的实现是一个简单的嵌套循环连接。首先检查左侧子表是否有下一个元组,没有就返回null,有就保存在curJoinTuple中,然后在右侧子表中筛选满足条件的元组,连接在一起,然后重置curJoinTuple和右侧子表,进入下一轮循环。

对Exercise 1执行单元测试和系统测试:

ant runtest -Dtest=PredicateTest
ant runtest -Dtest=JoinPredicateTest
ant runtest -Dtest=FilterTest
ant runtest -Dtest=JoinTest
ant runsystest -Dtest=FilterTest
ant runsystest -Dtest=JoinTest

此时测试应该都是successful的。

Exercise 2

需要完成的代码有:

  • src/java/simpledb/execution/IntegerAggregator.java
  • src/java/simpledb/execution/StringAggregator.java
  • src/java/simpledb/execution/Aggregate.java

这里通过Aggregator接口实现聚合功能,包括COUNT、SUM、AVG、MIN、MAX五个SQL聚合并支持分组,结果中的每个元组都是 (groupValue, aggregateValue) 形式的一对,除非group by字段是NOGROUPING,此时结果中的元组是 (aggregateValue) 形式。

IntegerAggregator.java代码:

package simpledb.execution;

import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.concurrent.ConcurrentHashMap;

import simpledb.common.DbException;
import simpledb.common.Type;
import simpledb.storage.StringField;
import simpledb.storage.Field;
import simpledb.storage.IntField;
import simpledb.storage.Tuple;
import simpledb.storage.TupleDesc;
import simpledb.transaction.TransactionAbortedException;

/**
 * Knows how to compute some aggregate over a set of IntFields.
 */
public class IntegerAggregator implements Aggregator {

    private static final long serialVersionUID = 1L;
    private static final Field NO_GROUP_FIELD = new StringField("NO_GROUP_FIELD", 20);
    private int gbfield;
    private Type gbfieldtype;
    private int afield;
    private Op what;
    private TupleDesc td;

    private Map<Field, GroupCalResult> groupCalMap;
    private Map<Field, Tuple> resultMap;

    private static class GroupCalResult {
        public static final Integer DEFAULT_COUNT = 0;
        public static final Integer Deactivate_COUNT = -1;
        public static final Integer DEFAULT_RES = 0;
        public static final Integer Deactivate_RES = -1;
        private Integer result;
        private Integer count;

        public GroupCalResult(int result, int count) {
            this.result = result;
            this.count = count;
        }
    }

    /**
     * Aggregate constructor
     *
     * @param gbfield     the 0-based index of the group-by field in the tuple, or
     *                    NO_GROUPING if there is no grouping
     * @param gbfieldtype the type of the group by field (e.g., Type.INT_TYPE), or null
     *                    if there is no grouping
     * @param afield      the 0-based index of the aggregate field in the tuple
     * @param what        the aggregation operator
     */

    public IntegerAggregator(int gbfield, Type gbfieldtype, int afield, Op what) {
        // TODO: some code goes here
        this.gbfield = gbfield;
        this.gbfieldtype = gbfieldtype;
        this.afield = afield;
        this.what = what;
        this.groupCalMap = new ConcurrentHashMap<>();
        this.resultMap = new ConcurrentHashMap<>();

        if (this.gbfield == NO_GROUPING) {
            this.td = new TupleDesc(new Type[]{Type.INT_TYPE}, new String[]{"aggregateVal"});
        } else {
            this.td = new TupleDesc(new Type[]{gbfieldtype, Type.INT_TYPE}, new String[]{"groupVal", "aggregateVal"});
        }
    }

    /**
     * Merge a new tuple into the aggregate, grouping as indicated in the
     * constructor
     *
     * @param tup the Tuple containing an aggregate field and a group-by field
     */
    public void mergeTupleIntoGroup(Tuple tup) {
        // TODO: some code goes here
        Field groupByField = this.gbfield == NO_GROUPING ? NO_GROUP_FIELD : tup.getField(this.gbfield);
        if (!NO_GROUP_FIELD.equals(groupByField) && !groupByField.getType().equals(this.gbfieldtype)) {
            throw new IllegalArgumentException("group by field type not match");
        }
        if (!(tup.getField(this.afield) instanceof IntField)) {
            throw new IllegalArgumentException("aggregate field type not match");
        }

        IntField aggregateField = (IntField) tup.getField(this.afield);
        int curVal = aggregateField.getValue();

        switch (this.what) {
            case MIN:
                this.groupCalMap.put(groupByField, new GroupCalResult(Math.min(groupCalMap.getOrDefault(groupByField, new GroupCalResult(Integer.MAX_VALUE, GroupCalResult.Deactivate_COUNT)).result, curVal), GroupCalResult.Deactivate_COUNT));
                break;
            case MAX:
                this.groupCalMap.put(groupByField, new GroupCalResult(Math.max(groupCalMap.getOrDefault(groupByField, new GroupCalResult(Integer.MIN_VALUE, GroupCalResult.Deactivate_COUNT)).result, curVal), GroupCalResult.Deactivate_COUNT));
                break;
            case SUM:
                this.groupCalMap.put(groupByField, new GroupCalResult(groupCalMap.getOrDefault(groupByField, new GroupCalResult(GroupCalResult.DEFAULT_RES, GroupCalResult.Deactivate_COUNT)).result + curVal, GroupCalResult.Deactivate_COUNT));
                break;
            case COUNT:
                this.groupCalMap.put(groupByField, new GroupCalResult(GroupCalResult.Deactivate_RES, groupCalMap.getOrDefault(groupByField, new GroupCalResult(GroupCalResult.Deactivate_RES, GroupCalResult.DEFAULT_COUNT)).count + 1));
                break;
            case AVG:
                GroupCalResult groupCalResult = groupCalMap.getOrDefault(groupByField, new GroupCalResult(GroupCalResult.DEFAULT_RES, GroupCalResult.DEFAULT_COUNT));
                this.groupCalMap.put(groupByField, new GroupCalResult(groupCalResult.result + curVal, groupCalResult.count + 1));
                break;
            case SUM_COUNT:
                break;
            case SC_AVG:
                break;
        }

        Tuple curCalTuple = new Tuple(this.td);
        int curCalRes = 0;
        if (this.what == Op.MIN || this.what == Op.MAX || this.what == Op.SUM) {
            curCalRes = groupCalMap.get(groupByField).result;
        } else if (this.what == Op.COUNT) {
            curCalRes = groupCalMap.get(groupByField).count;
        } else if (this.what == Op.AVG) {
            curCalRes = groupCalMap.get(groupByField).result / groupCalMap.get(groupByField).count;
        }
        if (this.gbfield == NO_GROUPING) {
            curCalTuple.setField(0, new IntField(curCalRes));
        } else {
            curCalTuple.setField(0, groupByField);
            curCalTuple.setField(1, new IntField(curCalRes));
        }

        this.resultMap.put(groupByField, curCalTuple);
    }

    /**
     * Create a OpIterator over group aggregate results.
     *
     * @return a OpIterator whose tuples are the pair (groupVal, aggregateVal)
     *         if using group, or a single (aggregateVal) if no grouping. The
     *         aggregateVal is determined by the type of aggregate specified in
     *         the constructor.
     */
    public OpIterator iterator() {
        // TODO: some code goes here
        return new IntAggTupIterator();
    }

    private class IntAggTupIterator implements OpIterator {
        private boolean isopen = false;
        private Iterator<Map.Entry<Field, Tuple>> iterator;

        @Override
        public void open() throws DbException, TransactionAbortedException {
            this.iterator = resultMap.entrySet().iterator();
            this.isopen = true;
        }

        @Override
        public void close() {
            this.isopen = false;
        }

        @Override
        public void rewind() throws DbException, TransactionAbortedException {
            if (!this.isopen) {
                throw new DbException("Iterator is not open");
            }
            close();
            open();
        }

        @Override
        public boolean hasNext() throws DbException, TransactionAbortedException {
            if (!this.isopen) {
                throw new DbException("Iterator is not open");
            }
            return this.iterator.hasNext();
        }

        @Override
        public Tuple next() throws DbException, TransactionAbortedException, NoSuchElementException {
            if (!this.isopen) {
                throw new DbException("Iterator is not open");
            }
            if (!hasNext()) {
                throw new NoSuchElementException("No more tuples");
            }
            Map.Entry<Field, Tuple> entry = this.iterator.next();
            return entry.getValue();
        }

        @Override
        public TupleDesc getTupleDesc() {
            return td;
        }
    }
}

这里定义了一个内部类GroupCalResult,保存计算的结果和计数,以及四个常量DEFAULT_COUNT、Deactivate_COUNT、DEFAULT_RES、Deactivate_RES分别表示默认计数(0)、停用计数标志(-1)、默认结果(0)和停用结果标志(-1)。

mergeTupleIntoGroup方法针对五种聚合操作实现了对应的功能,用Map<Field, GroupCalResult>保存group by字段的计算结果,Map<Field, Tuple>保存group by字段的元组结果,因此IntAggTupIterator也是用Iterator<Map.Entry<Field, Tuple>>来做迭代器。

此外,每个方法都需要注意处理NOGROUPING的情况。

StringAggregator.java代码:

package simpledb.execution;

import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import simpledb.common.DbException;
import simpledb.common.Type;
import simpledb.storage.StringField;
import simpledb.storage.Field;
import simpledb.storage.IntField;
import simpledb.storage.Tuple;
import simpledb.storage.TupleDesc;
import simpledb.transaction.TransactionAbortedException;

/**
 * Knows how to compute some aggregate over a set of StringFields.
 */
public class StringAggregator implements Aggregator {

    private static final long serialVersionUID = 1L;
    private static final Field NO_GROUP_FIELD = new StringField("NO_GROUP_FIELD", 20);
    private int gbfield;
    private Type gbfieldtype;
    private int afield;
    private TupleDesc td;

    private Map<Field, Integer> groupCalMap;
    private Map<Field, Tuple> resultMap;

    /**
     * Aggregate constructor
     *
     * @param gbfield     the 0-based index of the group-by field in the tuple, or NO_GROUPING if there is no grouping
     * @param gbfieldtype the type of the group by field (e.g., Type.INT_TYPE), or null if there is no grouping
     * @param afield      the 0-based index of the aggregate field in the tuple
     * @param what        aggregation operator to use -- only supports COUNT
     * @throws IllegalArgumentException if what != COUNT
     */

    public StringAggregator(int gbfield, Type gbfieldtype, int afield, Op what) {
        // TODO: some code goes here
        if (what != Op.COUNT) {
            throw new IllegalArgumentException("StringAggregator only supports COUNT");
        }

        this.gbfield = gbfield;
        this.gbfieldtype = gbfieldtype;
        this.afield = afield;
        
        this.groupCalMap = new ConcurrentHashMap<>();
        this.resultMap = new ConcurrentHashMap<>();

        if (this.gbfield == NO_GROUPING) {
            this.td = new TupleDesc(new Type[]{Type.INT_TYPE}, new String[]{"aggregateVal"});
        } else {
            this.td = new TupleDesc(new Type[]{this.gbfieldtype, Type.INT_TYPE}, new String[]{"groupVal", "aggregateVal"});
        }
    }

    /**
     * Merge a new tuple into the aggregate, grouping as indicated in the constructor
     *
     * @param tup the Tuple containing an aggregate field and a group-by field
     */
    public void mergeTupleIntoGroup(Tuple tup) {
        // TODO: some code goes here
        Field groupField = this.gbfield == NO_GROUPING ? NO_GROUP_FIELD : tup.getField(this.gbfield);
        if (!NO_GROUP_FIELD.equals(groupField) && groupField.getType() != this.gbfieldtype) {
            throw new IllegalArgumentException("group field type mismatch");
        }
        if (!(tup.getField(this.afield) instanceof StringField)) {
            throw new IllegalArgumentException("aggregate field type mismatch");
        }

        this.groupCalMap.put(groupField, this.groupCalMap.getOrDefault(groupField, 0) + 1);
        Tuple curCaTuple = new Tuple(td);
        if (this.gbfield == NO_GROUPING) {
            curCaTuple.setField(0, new IntField(this.groupCalMap.get(groupField)));
        } else {
            curCaTuple.setField(0, groupField);
            curCaTuple.setField(1, new IntField(this.groupCalMap.get(groupField)));
        }

        resultMap.put(groupField, curCaTuple);
    }

    /**
     * Create a OpIterator over group aggregate results.
     *
     * @return a OpIterator whose tuples are the pair (groupVal,
     *         aggregateVal) if using group, or a single (aggregateVal) if no
     *         grouping. The aggregateVal is determined by the type of
     *         aggregate specified in the constructor.
     */
    public OpIterator iterator() {
        // TODO: some code goes here
        return new StringAggTupIterator();
    }

    private class StringAggTupIterator implements OpIterator {
        private boolean isOpen = false;
        private Iterator<Map.Entry<Field, Tuple>> it;

        @Override
        public void open() throws DbException, TransactionAbortedException {
            it = resultMap.entrySet().iterator();
            isOpen = true;
        }

        @Override
        public void close() {
            isOpen = false;
        }

        @Override
        public boolean hasNext() throws DbException, TransactionAbortedException {
            if (!isOpen) {
                throw new DbException("Iterator is not open");
            }
            return it.hasNext();
        }

        @Override
        public Tuple next() throws DbException, TransactionAbortedException {
            if (!isOpen) {
                throw new DbException("Iterator is not open");
            }
            return it.next().getValue();
        }

        @Override
        public void rewind() throws DbException, TransactionAbortedException {
            if (!isOpen) {
                throw new DbException("Iterator is not open");
            }
            close();
            open();
        }

        @Override
        public TupleDesc getTupleDesc() {
            return td;
        }
    }
}

String类型的处理类似于Integer类型但相对简单,注释说明了只支持COUNT,聚合操作就是每次找到group by字段的结果然后+1。

Aggregate.java代码:

package simpledb.execution;

import simpledb.common.DbException;
import simpledb.common.Type;
import simpledb.execution.Aggregator.Op;
import simpledb.storage.Tuple;
import simpledb.storage.TupleDesc;
import simpledb.transaction.TransactionAbortedException;

import java.util.NoSuchElementException;

/**
 * The Aggregation operator that computes an aggregate (e.g., sum, avg, max,
 * min). Note that we only support aggregates over a single column, grouped by a
 * single column.
 */
public class Aggregate extends Operator {

    private static final long serialVersionUID = 1L;

    private OpIterator[] child;
    private int afield;
    private int gfield;
    private Aggregator.Op aop;
    private TupleDesc td;
    private Aggregator aggregator;
    private OpIterator iterator;

    /**
     * Constructor.
     * <p>
     * Implementation hint: depending on the type of afield, you will want to
     * construct an {@link IntegerAggregator} or {@link StringAggregator} to help
     * you with your implementation of readNext().
     *
     * @param child  The OpIterator that is feeding us tuples.
     * @param afield The column over which we are computing an aggregate.
     * @param gfield The column over which we are grouping the result, or -1 if
     *               there is no grouping
     * @param aop    The aggregation operator to use
     */
    public Aggregate(OpIterator child, int afield, int gfield, Aggregator.Op aop) {
        // TODO: some code goes here
        this.child = new OpIterator[]{child};
        this.afield = afield;
        this.gfield = gfield;
        this.aop = aop;
        
        if (gfield == Aggregator.NO_GROUPING) {
            this.td = new TupleDesc(new Type[]{child.getTupleDesc().getFieldType(afield)}, new String[]{child.getTupleDesc().getFieldName(afield)});
        } else {
            this.td = new TupleDesc(new Type[]{child.getTupleDesc().getFieldType(gfield), child.getTupleDesc().getFieldType(afield)}, new String[]{child.getTupleDesc().getFieldName(gfield), child.getTupleDesc().getFieldName(afield)});
        }

        if (child.getTupleDesc().getFieldType(afield) == Type.INT_TYPE) {
            this.aggregator = new IntegerAggregator(gfield, gfield == Aggregator.NO_GROUPING ? null : child.getTupleDesc().getFieldType(gfield), afield, aop);
        } else {
            this.aggregator = new StringAggregator(gfield, gfield == Aggregator.NO_GROUPING ? null : child.getTupleDesc().getFieldType(gfield), afield, aop);
        }
    }

    /**
     * @return If this aggregate is accompanied by a groupby, return the groupby
     *         field index in the <b>INPUT</b> tuples. If not, return
     *         {@link Aggregator#NO_GROUPING}
     */
    public int groupField() {
        // TODO: some code goes here
        return this.gfield;
    }

    /**
     * @return If this aggregate is accompanied by a group by, return the name
     *         of the groupby field in the <b>OUTPUT</b> tuples. If not, return
     *         null;
     */
    public String groupFieldName() {
        // TODO: some code goes here
        return this.td.getFieldName(gfield);
    }

    /**
     * @return the aggregate field
     */
    public int aggregateField() {
        // TODO: some code goes here
        return this.afield;
    }

    /**
     * @return return the name of the aggregate field in the <b>OUTPUT</b>
     *         tuples
     */
    public String aggregateFieldName() {
        // TODO: some code goes here
        return this.td.getFieldName(afield);
    }

    /**
     * @return return the aggregate operator
     */
    public Aggregator.Op aggregateOp() {
        // TODO: some code goes here
        return this.aop;
    }

    public static String nameOfAggregatorOp(Aggregator.Op aop) {
        return aop.toString();
    }

    public void open() throws NoSuchElementException, DbException,
            TransactionAbortedException {
        // TODO: some code goes here
        super.open();
        this.child[0].open();
        while (this.child[0].hasNext()) {
            this.aggregator.mergeTupleIntoGroup(this.child[0].next());
        }
        this.iterator = this.aggregator.iterator();
        this.iterator.open();
    }

    /**
     * Returns the next tuple. If there is a group by field, then the first
     * field is the field by which we are grouping, and the second field is the
     * result of computing the aggregate. If there is no group by field, then
     * the result tuple should contain one field representing the result of the
     * aggregate. Should return null if there are no more tuples.
     */
    protected Tuple fetchNext() throws TransactionAbortedException, DbException {
        // TODO: some code goes here
        if (this.iterator.hasNext()) {
            return this.iterator.next();
        }
        return null;
    }

    public void rewind() throws DbException, TransactionAbortedException {
        // TODO: some code goes here
        this.child[0].rewind();
        this.iterator.rewind();
    }

    /**
     * Returns the TupleDesc of this Aggregate. If there is no group by field,
     * this will have one field - the aggregate column. If there is a group by
     * field, the first field will be the group by field, and the second will be
     * the aggregate value column.
     * <p>
     * The name of an aggregate column should be informative. For example:
     * "aggName(aop) (child_td.getFieldName(afield))" where aop and afield are
     * given in the constructor, and child_td is the TupleDesc of the child
     * iterator.
     */
    public TupleDesc getTupleDesc() {
        // TODO: some code goes here
        return this.td;
    }

    public void close() {
        // TODO: some code goes here
        this.iterator.close();
        this.child[0].close();
    }

    @Override
    public OpIterator[] getChildren() {
        // TODO: some code goes here
        return this.child;
    }

    @Override
    public void setChildren(OpIterator[] children) {
        // TODO: some code goes here
        this.child = children;
    }

}

Aggregate实现了最终的聚合操作,会调用前面实现的IntegerAggregator和StringAggregator。需要注意的是聚合结果的TupleDesc:

if (gfield == Aggregator.NO_GROUPING) {
    this.td = new TupleDesc(new Type[]{child.getTupleDesc().getFieldType(afield)}, new String[]{child.getTupleDesc().getFieldName(afield)});
} else {
    this.td = new TupleDesc(new Type[]{child.getTupleDesc().getFieldType(gfield), child.getTupleDesc().getFieldType(afield)}, new String[]{child.getTupleDesc().getFieldName(gfield), child.getTupleDesc().getFieldName(afield)});
}

要根据分组字段和聚合字段按顺序排列处理。

对Exercise 2执行单元测试和系统测试:

ant runtest -Dtest=IntegerAggregatorTest
ant runtest -Dtest=StringAggregatorTest
ant runtest -Dtest=AggregateTest
ant runsystest -Dtest=AggregateTest

此时测试应该都是successful的。

Exercise 3

需要完成的是以下文件中剩余的部分:

  • src/java/simpledb/storage/HeapPage.java
  • src/java/simpledb/storage/HeapFile.java

以及src/simpledb/BufferPool.java中的:

  • insertTuple()
  • deleteTuple()

这里主要实现的是添加元组和删除元组的方法,同时会涉及到一些对页面的修改。

HeapPage.java代码:

package simpledb.storage;

import simpledb.common.Catalog;
import simpledb.common.Database;
import simpledb.common.DbException;
import simpledb.transaction.TransactionId;

import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

/**
 * Each instance of HeapPage stores data for one page of HeapFiles and
 * implements the Page interface that is used by BufferPool.
 *
 * @see HeapFile
 * @see BufferPool
 */
public class HeapPage implements Page {

    final HeapPageId pid;
    final TupleDesc td;
    final byte[] header;
    final Tuple[] tuples;
    final int numSlots;

    byte[] oldData;
    private final Byte oldDataLock = (byte) 0;

    private TransactionId dirtyTid;
    private boolean dirtyFlag;

    /**
     * Create a HeapPage from a set of bytes of data read from disk.
     * The format of a HeapPage is a set of header bytes indicating
     * the slots of the page that are in use, some number of tuple slots.
     * Specifically, the number of tuples is equal to: <p>
     * floor((BufferPool.getPageSize()*8) / (tuple size * 8 + 1))
     * <p> where tuple size is the size of tuples in this
     * database table, which can be determined via {@link Catalog#getTupleDesc}.
     * The number of 8-bit header words is equal to:
     * <p>
     * ceiling(no. tuple slots / 8)
     * <p>
     *
     * @see Database#getCatalog
     * @see Catalog#getTupleDesc
     * @see BufferPool#getPageSize()
     */
    public HeapPage(HeapPageId id, byte[] data) throws IOException {
        this.pid = id;
        this.td = Database.getCatalog().getTupleDesc(id.getTableId());
        this.numSlots = getNumTuples();
        DataInputStream dis = new DataInputStream(new ByteArrayInputStream(data));

        // allocate and read the header slots of this page
        header = new byte[getHeaderSize()];
        for (int i = 0; i < header.length; i++)
            header[i] = dis.readByte();

        tuples = new Tuple[numSlots];
        try {
            // allocate and read the actual records of this page
            for (int i = 0; i < tuples.length; i++)
                tuples[i] = readNextTuple(dis, i);
        } catch (NoSuchElementException e) {
            e.printStackTrace();
        }
        dis.close();

        setBeforeImage();
    }

    /**
     * Retrieve the number of tuples on this page.
     *
     * @return the number of tuples on this page
     */
    private int getNumTuples() {
        // TODO: some code goes here
        return (BufferPool.getPageSize() * 8) / (td.getSize() * 8 + 1);

    }

    /**
     * Computes the number of bytes in the header of a page in a HeapFile with each tuple occupying tupleSize bytes
     *
     * @return the number of bytes in the header of a page in a HeapFile with each tuple occupying tupleSize bytes
     */
    private int getHeaderSize() {

        // TODO: some code goes here
        return (int) Math.ceil((double) getNumTuples() / 8);

    }

    /**
     * Return a view of this page before it was modified
     * -- used by recovery
     */
    public HeapPage getBeforeImage() {
        try {
            byte[] oldDataRef = null;
            synchronized (oldDataLock) {
                oldDataRef = oldData;
            }
            return new HeapPage(pid, oldDataRef);
        } catch (IOException e) {
            e.printStackTrace();
            //should never happen -- we parsed it OK before!
            System.exit(1);
        }
        return null;
    }

    public void setBeforeImage() {
        synchronized (oldDataLock) {
            oldData = getPageData().clone();
        }
    }

    /**
     * @return the PageId associated with this page.
     */
    public HeapPageId getId() {
        // TODO: some code goes here
        return this.pid;
    }

    /**
     * Suck up tuples from the source file.
     */
    private Tuple readNextTuple(DataInputStream dis, int slotId) throws NoSuchElementException {
        // if associated bit is not set, read forward to the next tuple, and
        // return null.
        if (!isSlotUsed(slotId)) {
            for (int i = 0; i < td.getSize(); i++) {
                try {
                    dis.readByte();
                } catch (IOException e) {
                    throw new NoSuchElementException("error reading empty tuple");
                }
            }
            return null;
        }

        // read fields in the tuple
        Tuple t = new Tuple(td);
        RecordId rid = new RecordId(pid, slotId);
        t.setRecordId(rid);
        try {
            for (int j = 0; j < td.numFields(); j++) {
                Field f = td.getFieldType(j).parse(dis);
                t.setField(j, f);
            }
        } catch (java.text.ParseException e) {
            e.printStackTrace();
            throw new NoSuchElementException("parsing error!");
        }

        return t;
    }

    /**
     * Generates a byte array representing the contents of this page.
     * Used to serialize this page to disk.
     * <p>
     * The invariant here is that it should be possible to pass the byte
     * array generated by getPageData to the HeapPage constructor and
     * have it produce an identical HeapPage object.
     *
     * @return A byte array correspond to the bytes of this page.
     * @see #HeapPage
     */
    public byte[] getPageData() {
        int len = BufferPool.getPageSize();
        ByteArrayOutputStream baos = new ByteArrayOutputStream(len);
        DataOutputStream dos = new DataOutputStream(baos);

        // create the header of the page
        for (byte b : header) {
            try {
                dos.writeByte(b);
            } catch (IOException e) {
                // this really shouldn't happen
                e.printStackTrace();
            }
        }

        // create the tuples
        for (int i = 0; i < tuples.length; i++) {

            // empty slot
            if (!isSlotUsed(i)) {
                for (int j = 0; j < td.getSize(); j++) {
                    try {
                        dos.writeByte(0);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }

                }
                continue;
            }

            // non-empty slot
            for (int j = 0; j < td.numFields(); j++) {
                Field f = tuples[i].getField(j);
                try {
                    f.serialize(dos);

                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        // padding
        int zerolen = BufferPool.getPageSize() - (header.length + td.getSize() * tuples.length); //- numSlots * td.getSize();
        byte[] zeroes = new byte[zerolen];
        try {
            dos.write(zeroes, 0, zerolen);
        } catch (IOException e) {
            e.printStackTrace();
        }

        try {
            dos.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }

        return baos.toByteArray();
    }

    /**
     * Static method to generate a byte array corresponding to an empty
     * HeapPage.
     * Used to add new, empty pages to the file. Passing the results of
     * this method to the HeapPage constructor will create a HeapPage with
     * no valid tuples in it.
     *
     * @return The returned ByteArray.
     */
    public static byte[] createEmptyPageData() {
        int len = BufferPool.getPageSize();
        return new byte[len]; //all 0
    }

    /**
     * Delete the specified tuple from the page; the corresponding header bit should be updated to reflect
     * that it is no longer stored on any page.
     *
     * @param t The tuple to delete
     * @throws DbException if this tuple is not on this page, or tuple slot is
     *                     already empty.
     */
    public void deleteTuple(Tuple t) throws DbException {
        // TODO: some code goes here
        // not necessary for lab1
        int tid = t.getRecordId().getTupleNumber();
        boolean isExist = false;
        if (tid >= 0 && tid <= tuples.length && t.equals(tuples[tid])) {
            if (!isSlotUsed(tid)) {
                throw new DbException("tuple slot is already empty");
            }
            markSlotUsed(tid, false);
            tuples[tid] = null;
            isExist = true;
        }
        if (!isExist) {
            throw new DbException("this tuple is not on this page");
        }
    }

    /**
     * Adds the specified tuple to the page;  the tuple should be updated to reflect
     * that it is now stored on this page.
     *
     * @param t The tuple to add.
     * @throws DbException if the page is full (no empty slots) or tupledesc
     *                     is mismatch.
     */
    public void insertTuple(Tuple t) throws DbException {
        // TODO: some code goes here
        // not necessary for lab1
        if (getNumUnusedSlots() == 0 || !t.getTupleDesc().equals(td)) {
            throw new DbException("page is full or tupledesc is mismatch");
        }
        for (int i = 0; i < numSlots; ++i) {
            if (!isSlotUsed(i)) {
                markSlotUsed(i, true);
                t.setRecordId(new RecordId(pid, i));
                tuples[i] = t;
                return;
            }
        }
    }

    /**
     * Marks this page as dirty/not dirty and record that transaction
     * that did the dirtying
     */
    public void markDirty(boolean dirty, TransactionId tid) {
        // TODO: some code goes here
        // not necessary for lab1
        this.dirtyFlag = dirty;
        this.dirtyTid = tid;
    }

    /**
     * Returns the tid of the transaction that last dirtied this page, or null if the page is not dirty
     */
    public TransactionId isDirty() {
        // TODO: some code goes here
        // Not necessary for lab1
        return this.dirtyFlag ? this.dirtyTid : null;
    }

    /**
     * Returns the number of unused (i.e., empty) slots on this page.
     */
    public int getNumUnusedSlots() {
        // TODO: some code goes here
        int cnt = 0;
        for (int i = 0; i < numSlots; ++i) {
            if (!isSlotUsed(i)) {
                ++cnt;
            }
        }
        return cnt;
    }

    /**
     * Returns true if associated slot on this page is filled.
     */
    public boolean isSlotUsed(int i) {
        // TODO: some code goes here
        int iTh = i / 8;
        int bitTh = i % 8;
        int onBit = (header[iTh] >> bitTh) & 1;
        return onBit == 1;
    }

    /**
     * Abstraction to fill or clear a slot on this page.
     */
    private void markSlotUsed(int i, boolean value) {
        // TODO: some code goes here
        // not necessary for lab1
        int iTh = i / 8;
        int bitTh = i % 8;
        int onBit = (header[iTh] >> bitTh) & 1;
        if (onBit == 0 && value) {
            header[iTh] |= (1 << bitTh);
        } else if (onBit == 1 && !value) {
            header[iTh] &= ~(1 << bitTh);
        }
    }

    /**
     * @return an iterator over all tuples on this page (calling remove on this iterator throws an UnsupportedOperationException)
     *         (note that this iterator shouldn't return tuples in empty slots!)
     */
    public Iterator<Tuple> iterator() {
        // TODO: some code goes here
        List<Tuple> tupleList = new ArrayList<>();
        for (int i = 0; i < numSlots; ++i) {
            if (isSlotUsed(i)) {
                tupleList.add(tuples[i]);
            }
        }
        return tupleList.iterator();
    }

}

deleteTuple直接取TupleNumber,判断合法后修改header并置为null即可。insertTuple需要多一步搜索未使用槽位的步骤。

HeapFile.java代码:

package simpledb.storage;

import simpledb.common.Database;
import simpledb.common.DbException;
import simpledb.common.Permissions;
import simpledb.transaction.TransactionAbortedException;
import simpledb.transaction.TransactionId;

import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

/**
 * HeapFile is an implementation of a DbFile that stores a collection of tuples
 * in no particular order. Tuples are stored on pages, each of which is a fixed
 * size, and the file is simply a collection of those pages. HeapFile works
 * closely with HeapPage. The format of HeapPages is described in the HeapPage
 * constructor.
 *
 * @author Sam Madden
 * @see HeapPage#HeapPage
 */
public class HeapFile implements DbFile {

    private final File file;
    private final TupleDesc tupleDesc;

    private static final class HeapFileIterator implements DbFileIterator {
        private final TransactionId tid;
        private final HeapFile heapFile;
        private Iterator<Tuple> tupleIterator;
        private int currentPageNo;

        public HeapFileIterator(TransactionId tid, HeapFile heapFile) {
            this.tid = tid;
            this.heapFile = heapFile;
        }

        @Override
        public void open() throws DbException, TransactionAbortedException {
            currentPageNo = 0;
            tupleIterator = getTupleIterator(currentPageNo);
        }

        private Iterator<Tuple> getTupleIterator(int pageNo) throws TransactionAbortedException, DbException {
            if (pageNo >= 0 && pageNo < heapFile.numPages()) {
                HeapPageId pid = new HeapPageId(heapFile.getId(), pageNo);
                HeapPage page = (HeapPage) Database.getBufferPool().getPage(tid, pid, Permissions.READ_ONLY);
                return page.iterator();
            } else {
                throw new DbException(String.format("heapFile %d doesn't exist in page[%d]", pageNo, heapFile.getId()));
            }
        }

        @Override
        public boolean hasNext() throws DbException, TransactionAbortedException {
            if (tupleIterator == null) {
                return false;
            }
            if (tupleIterator.hasNext()) {
                return true;
            }
            if (currentPageNo < heapFile.numPages() - 1) {
                currentPageNo++;
                tupleIterator = getTupleIterator(currentPageNo);
                return this.hasNext();
            }
            return false;
        }

        @Override
        public Tuple next() throws DbException, TransactionAbortedException {
            if (tupleIterator == null || !tupleIterator.hasNext()) {
                throw new NoSuchElementException();
            }
            return tupleIterator.next();
        }

        @Override
        public void rewind() throws DbException, TransactionAbortedException {
            close();
            open();
        }

        @Override
        public void close() {
            tupleIterator = null;
            currentPageNo = 0;
        }
    }

    /**
     * Constructs a heap file backed by the specified file.
     *
     * @param f the file that stores the on-disk backing store for this heap
     *          file.
     */
    public HeapFile(File f, TupleDesc td) {
        // TODO: some code goes here
        this.file = f;
        this.tupleDesc = td;
    }

    /**
     * Returns the File backing this HeapFile on disk.
     *
     * @return the File backing this HeapFile on disk.
     */
    public File getFile() {
        // TODO: some code goes here
        return file;
    }

    /**
     * Returns an ID uniquely identifying this HeapFile. Implementation note:
     * you will need to generate this tableid somewhere to ensure that each
     * HeapFile has a "unique id," and that you always return the same value for
     * a particular HeapFile. We suggest hashing the absolute file name of the
     * file underlying the heapfile, i.e. f.getAbsoluteFile().hashCode().
     *
     * @return an ID uniquely identifying this HeapFile.
     */
    public int getId() {
        // TODO: some code goes here
        return file.getAbsoluteFile().hashCode();
    }

    /**
     * Returns the TupleDesc of the table stored in this DbFile.
     *
     * @return TupleDesc of this DbFile.
     */
    public TupleDesc getTupleDesc() {
        // TODO: some code goes here
        return this.tupleDesc;
    }

    // see DbFile.java for javadocs
    public Page readPage(PageId pid) {
        // TODO: some code goes here
        int tableId = pid.getTableId();
        int pageNo = pid.getPageNumber();
        int offset = pageNo * BufferPool.getPageSize();
        RandomAccessFile randomAccessFile = null;

        try {
            randomAccessFile = new RandomAccessFile(file, "r");
            if ((long) (pageNo + 1) * BufferPool.getPageSize() > randomAccessFile.length()) {
                randomAccessFile.close();
                throw new IllegalArgumentException("pageNo is out of file length");
            }
            byte[] data = new byte[BufferPool.getPageSize()];
            randomAccessFile.seek(offset);
            int read = randomAccessFile.read(data, 0, BufferPool.getPageSize());
            if (read != BufferPool.getPageSize()) {
                throw new IllegalArgumentException("read page failed");
            }
            HeapPageId id = new HeapPageId(tableId, pageNo);
            return new HeapPage(id, data);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (randomAccessFile != null) {
                    randomAccessFile.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        throw new IllegalArgumentException("read page failed");
    }

    // see DbFile.java for javadocs
    public void writePage(Page page) throws IOException {
        // TODO: some code goes here
        // not necessary for lab1
        PageId pageId = page.getId();
        int pageNo = pageId.getPageNumber();
        int offset = pageNo * BufferPool.getPageSize();
        byte[] pageData = page.getPageData();

        RandomAccessFile file = new RandomAccessFile(this.file, "rw");
        file.seek(offset);
        file.write(pageData);
        file.close();

        page.markDirty(false, null);
    }

    /**
     * Returns the number of pages in this HeapFile.
     */
    public int numPages() {
        // TODO: some code goes here
        return (int) Math.floor(getFile().length() * 1.0 / BufferPool.getPageSize());
    }

    // see DbFile.java for javadocs
    public List<Page> insertTuple(TransactionId tid, Tuple t)
            throws DbException, IOException, TransactionAbortedException {
        // TODO: some code goes here
        // not necessary for lab1
        ArrayList<Page> pages = new ArrayList<Page>();
        for (int i = 0; i < numPages(); i++) {
            HeapPage page = (HeapPage) Database.getBufferPool().getPage(tid, new HeapPageId(getId(), i), Permissions.READ_WRITE);
            if (page.getNumUnusedSlots() > 0) {
                page.insertTuple(t);
                pages.add(page);
                return pages;
            }
        }

        BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(file, true));
        byte[] emptyData = HeapPage.createEmptyPageData();
        bw.write(emptyData);
        bw.close();
        HeapPage page = (HeapPage) Database.getBufferPool().getPage(tid, new HeapPageId(getId(), numPages() - 1), Permissions.READ_WRITE);
        page.insertTuple(t);
        pages.add(page);
        return pages;
    }

    // see DbFile.java for javadocs
    public List<Page> deleteTuple(TransactionId tid, Tuple t) throws DbException,
            TransactionAbortedException {
        // TODO: some code goes here
        // not necessary for lab1
        HeapPage page = (HeapPage) Database.getBufferPool().getPage(tid, t.getRecordId().getPageId(), Permissions.READ_WRITE);
        page.deleteTuple(t);
        return Collections.singletonList(page);
    }

    // see DbFile.java for javadocs
    public DbFileIterator iterator(TransactionId tid) {
        // TODO: some code goes here
        return new HeapFileIterator(tid, this);
    }

}

insertTuple方法主要是要找到一个未使用的槽位插入Tuple,如果没有则需要申请一个新的页面。deleteTuple方法在删除Tuple后使用了Collections.singletonList(page)方法创建了一个只包含一个元素的列表,这个元素就是刚刚修改过的页面。

BufferPool.java代码:

    /**
     * Add a tuple to the specified table on behalf of transaction tid.  Will
     * acquire a write lock on the page the tuple is added to and any other
     * pages that are updated (Lock acquisition is not needed for lab2).
     * May block if the lock(s) cannot be acquired.
     * <p>
     * Marks any pages that were dirtied by the operation as dirty by calling
     * their markDirty bit, and adds versions of any pages that have
     * been dirtied to the cache (replacing any existing versions of those pages) so
     * that future requests see up-to-date pages.
     *
     * @param tid     the transaction adding the tuple
     * @param tableId the table to add the tuple to
     * @param t       the tuple to add
     */
    public void insertTuple(TransactionId tid, int tableId, Tuple t)
            throws DbException, IOException, TransactionAbortedException {
        // TODO: some code goes here
        // not necessary for lab1
        DbFile dbFile = Database.getCatalog().getDatabaseFile(tableId);
        List<Page> pages = dbFile.insertTuple(tid, t);
        for (Page page : pages) {
            page.markDirty(true, tid);
            bufferPool.put(page.getId(), page);
        }        
    }

    /**
     * Remove the specified tuple from the buffer pool.
     * Will acquire a write lock on the page the tuple is removed from and any
     * other pages that are updated. May block if the lock(s) cannot be acquired.
     * <p>
     * Marks any pages that were dirtied by the operation as dirty by calling
     * their markDirty bit, and adds versions of any pages that have
     * been dirtied to the cache (replacing any existing versions of those pages) so
     * that future requests see up-to-date pages.
     *
     * @param tid the transaction deleting the tuple.
     * @param t   the tuple to delete
     */
    public void deleteTuple(TransactionId tid, Tuple t)
            throws DbException, IOException, TransactionAbortedException {
        // TODO: some code goes here
        // not necessary for lab1
        RecordId recordId = t.getRecordId();
        PageId pageId = recordId.getPageId();
        DbFile dbFile = Database.getCatalog().getDatabaseFile(pageId.getTableId());
        List<Page> pages = dbFile.deleteTuple(tid, t);
        for (Page page : pages) {
            page.markDirty(true, tid);
        }
    }

对Exercise 3执行单元测试:

ant runtest -Dtest=HeapPageWriteTest
ant runtest -Dtest=HeapFileWriteTest
ant runtest -Dtest=BufferPoolWriteTest

此时测试应该都是successful的。

Exercise 4

需要完成的是:

  • src/java/simpledb/execution/Insert.java
  • src/java/simpledb/execution/Delete.java

这两个操作符应该调用BufferPool来插入和删除元组。

Insert.java代码:

package simpledb.execution;

import simpledb.common.Database;
import simpledb.common.DbException;
import simpledb.common.Type;
import simpledb.storage.BufferPool;
import simpledb.storage.IntField;
import simpledb.storage.Tuple;
import simpledb.storage.TupleDesc;
import simpledb.transaction.TransactionAbortedException;
import simpledb.transaction.TransactionId;

/**
 * Inserts tuples read from the child operator into the tableId specified in the
 * constructor
 */
public class Insert extends Operator {

    private static final long serialVersionUID = 1L;
    private TransactionId t;
    private OpIterator[] children;
    private int tableId;
    private TupleDesc td;
    private Tuple res;

    /**
     * Constructor.
     *
     * @param t       The transaction running the insert.
     * @param child   The child operator from which to read tuples to be inserted.
     * @param tableId The table in which to insert tuples.
     * @throws DbException if TupleDesc of child differs from table into which we are to
     *                     insert.
     */
    public Insert(TransactionId t, OpIterator child, int tableId)
            throws DbException {
        // TODO: some code goes here
        this.t = t;
        this.children = new OpIterator[]{child};
        this.tableId = tableId;
        this.td = new TupleDesc(new Type[]{Type.INT_TYPE}, new String[]{"insertNums"});
    }

    public TupleDesc getTupleDesc() {
        // TODO: some code goes here
        return td;
    }

    public void open() throws DbException, TransactionAbortedException {
        // TODO: some code goes here
        super.open();
        children[0].open();
        res = null;
    }

    public void close() {
        // TODO: some code goes here
        super.close();
        children[0].close();
    }

    public void rewind() throws DbException, TransactionAbortedException {
        // TODO: some code goes here
        close();
        open();
    }

    /**
     * Inserts tuples read from child into the tableId specified by the
     * constructor. It returns a one field tuple containing the number of
     * inserted records. Inserts should be passed through BufferPool. An
     * instances of BufferPool is available via Database.getBufferPool(). Note
     * that insert DOES NOT need check to see if a particular tuple is a
     * duplicate before inserting it.
     *
     * @return A 1-field tuple containing the number of inserted records, or
     *         null if called more than once.
     * @see Database#getBufferPool
     * @see BufferPool#insertTuple
     */
    protected Tuple fetchNext() throws TransactionAbortedException, DbException {
        // TODO: some code goes here
        if (res != null) {
            return null;
        }
        int cnt = 0;
        while (children[0].hasNext()) {
            try {
                Database.getBufferPool().insertTuple(t, tableId, children[0].next());
                ++cnt;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        res = new Tuple(td);
        res.setField(0, new IntField(cnt));
        return res;
    }

    @Override
    public OpIterator[] getChildren() {
        // TODO: some code goes here
        return this.children;
    }

    @Override
    public void setChildren(OpIterator[] children) {
        // TODO: some code goes here
        this.children = children;
    }
}

主要在于fetchNext方法实现,注释说明不需要检查元组是否重复,因此将子操作符中的元组全部插入到BufferPool中,并返回一个元组,这个元组的字段包含的是插入元组的数量。

Delete.java代码:

package simpledb.execution;

import simpledb.common.Database;
import simpledb.common.DbException;
import simpledb.common.Type;
import simpledb.storage.BufferPool;
import simpledb.storage.IntField;
import simpledb.storage.Tuple;
import simpledb.storage.TupleDesc;
import simpledb.transaction.TransactionAbortedException;
import simpledb.transaction.TransactionId;

import java.io.IOException;

/**
 * The delete operator. Delete reads tuples from its child operator and removes
 * them from the table they belong to.
 */
public class Delete extends Operator {

    private static final long serialVersionUID = 1L;
    private TransactionId t;
    private OpIterator[] children;
    private TupleDesc td;
    private Tuple res;

    /**
     * Constructor specifying the transaction that this delete belongs to as
     * well as the child to read from.
     *
     * @param t     The transaction this delete runs in
     * @param child The child operator from which to read tuples for deletion
     */
    public Delete(TransactionId t, OpIterator child) {
        // TODO: some code goes here
        this.t = t;
        this.children = new OpIterator[]{child};
        this.td = new TupleDesc(new Type[]{Type.INT_TYPE}, new String[]{"deleteNums"});
    }

    public TupleDesc getTupleDesc() {
        // TODO: some code goes here
        return td;
    }

    public void open() throws DbException, TransactionAbortedException {
        // TODO: some code goes here
        super.open();
        children[0].open();
    }

    public void close() {
        // TODO: some code goes here
        super.close();
        children[0].close();
    }

    public void rewind() throws DbException, TransactionAbortedException {
        // TODO: some code goes here
        close();
        open();
    }

    /**
     * Deletes tuples as they are read from the child operator. Deletes are
     * processed via the buffer pool (which can be accessed via the
     * Database.getBufferPool() method.
     *
     * @return A 1-field tuple containing the number of deleted records.
     * @see Database#getBufferPool
     * @see BufferPool#deleteTuple
     */
    protected Tuple fetchNext() throws TransactionAbortedException, DbException {
        // TODO: some code goes here
        if (res != null) {
            return null;
        }
        int cnt = 0;
        while (children[0].hasNext()) {
            Tuple tuple = children[0].next();
            try {
                Database.getBufferPool().deleteTuple(t, tuple);
                cnt++;
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        res = new Tuple(td);
        res.setField(0, new IntField(cnt));
        return res;
    }

    @Override
    public OpIterator[] getChildren() {
        // TODO: some code goes here
        return this.children;
    }

    @Override
    public void setChildren(OpIterator[] children) {
        // TODO: some code goes here
        this.children = children;
    }

}

和Insert类似,主要实现fetchNext方法。

对Exercise 4执行单元测试和系统测试:

ant runtest -Dtest=InsertTest
ant runsystest -Dtest=InsertTest
ant runsystest -Dtest=DeleteTest

此时测试应该都是successful的。

Exercise 5

需要完成的是:

  • src/java/simpledb/storage/BufferPool.java

为了完成这个Exercise要求的功能,对之前已经完成的部分也进行了修改,将bufferPool从ConcurrentHashMap改为了LinkedHashMap,原因是现在需要为BufferPool选择一个页面回收策略。

实验要求没有指定任何策略,但在实际的数据库系统中,页面的访问往往是不均匀的,有些页面可能会被频繁地访问,而有些页面可能很少被访问。如果一个页面很少被访问,那么即使它一直保留在内存中,也不会带来太大的好处。相反,如果一个页面被频繁地访问,那么将它保留在内存中可以大大提高系统的性能。因此,更好的页面淘汰策略通常会考虑页面的访问频率和访问模式。例如,最近最少使用(LRU)策略会淘汰最长时间没有被访问的页面,而最不经常使用(LFU)策略会淘汰访问频率最低的页面。

LRU策略是数据库比较常用的策略,因此将BufferPool改写用LRU策略实现。

BufferPool.java代码:

package simpledb.storage;

import simpledb.common.Database;
import simpledb.common.DbException;
import simpledb.common.DeadlockException;
import simpledb.common.Permissions;
import simpledb.transaction.TransactionAbortedException;
import simpledb.transaction.TransactionId;

import java.io.IOException;
import java.util.*;

/**
 * BufferPool manages the reading and writing of pages into memory from
 * disk. Access methods call into it to retrieve pages, and it fetches
 * pages from the appropriate location.
 * <p>
 * The BufferPool is also responsible for locking;  when a transaction fetches
 * a page, BufferPool checks that the transaction has the appropriate
 * locks to read/write the page.
 *
 * @Threadsafe, all fields are final
 */
public class BufferPool {
    /**
     * Bytes per page, including header.
     */
    private static final int DEFAULT_PAGE_SIZE = 4096;

    private static int pageSize = DEFAULT_PAGE_SIZE;

    /**
     * Default number of pages passed to the constructor. This is used by
     * other classes. BufferPool should use the numPages argument to the
     * constructor instead.
     */
    public static final int DEFAULT_PAGES = 50;

    private final int numPages;
    private final Map<PageId, Page> bufferPool;

    /**
     * Creates a BufferPool that caches up to numPages pages.
     *
     * @param numPages maximum number of pages in this buffer pool.
     */
    public BufferPool(int numPages) {
        // TODO: some code goes here
        this.numPages = numPages;
        this.bufferPool = new LinkedHashMap<PageId, Page>(numPages, 0.75f, true) {
            @Override
            protected boolean removeEldestEntry(Map.Entry<PageId, Page> eldest) {
                return size() > numPages;
            }
        };
    }

    public static int getPageSize() {
        return pageSize;
    }

    // THIS FUNCTION SHOULD ONLY BE USED FOR TESTING!!
    public static void setPageSize(int pageSize) {
        BufferPool.pageSize = pageSize;
    }

    // THIS FUNCTION SHOULD ONLY BE USED FOR TESTING!!
    public static void resetPageSize() {
        BufferPool.pageSize = DEFAULT_PAGE_SIZE;
    }

    /**
     * Retrieve the specified page with the associated permissions.
     * Will acquire a lock and may block if that lock is held by another
     * transaction.
     * <p>
     * The retrieved page should be looked up in the buffer pool.  If it
     * is present, it should be returned.  If it is not present, it should
     * be added to the buffer pool and returned.  If there is insufficient
     * space in the buffer pool, a page should be evicted and the new page
     * should be added in its place.
     *
     * @param tid  the ID of the transaction requesting the page
     * @param pid  the ID of the requested page
     * @param perm the requested permissions on the page
     */
    public Page getPage(TransactionId tid, PageId pid, Permissions perm)
            throws TransactionAbortedException, DbException {
        // TODO: some code goes here
        Page page = bufferPool.get(pid);
        if (page != null) {
            bufferPool.remove(pid);
            bufferPool.put(pid, page);
            return page;
        }
        if (bufferPool.size() >= numPages) {
            evictPage();
        }
        DbFile dbFile = Database.getCatalog().getDatabaseFile(pid.getTableId());
        page = dbFile.readPage(pid);
        bufferPool.put(pid, page);
        return page;
    }

    /**
     * Releases the lock on a page.
     * Calling this is very risky, and may result in wrong behavior. Think hard
     * about who needs to call this and why, and why they can run the risk of
     * calling it.
     *
     * @param tid the ID of the transaction requesting the unlock
     * @param pid the ID of the page to unlock
     */
    public void unsafeReleasePage(TransactionId tid, PageId pid) {
        // TODO: some code goes here
        // not necessary for lab1|lab2
    }

    /**
     * Release all locks associated with a given transaction.
     *
     * @param tid the ID of the transaction requesting the unlock
     */
    public void transactionComplete(TransactionId tid) {
        // TODO: some code goes here
        // not necessary for lab1|lab2
    }

    /**
     * Return true if the specified transaction has a lock on the specified page
     */
    public boolean holdsLock(TransactionId tid, PageId p) {
        // TODO: some code goes here
        // not necessary for lab1|lab2
        return false;
    }

    /**
     * Commit or abort a given transaction; release all locks associated to
     * the transaction.
     *
     * @param tid    the ID of the transaction requesting the unlock
     * @param commit a flag indicating whether we should commit or abort
     */
    public void transactionComplete(TransactionId tid, boolean commit) {
        // TODO: some code goes here
        // not necessary for lab1|lab2
    }

    /**
     * Add a tuple to the specified table on behalf of transaction tid.  Will
     * acquire a write lock on the page the tuple is added to and any other
     * pages that are updated (Lock acquisition is not needed for lab2).
     * May block if the lock(s) cannot be acquired.
     * <p>
     * Marks any pages that were dirtied by the operation as dirty by calling
     * their markDirty bit, and adds versions of any pages that have
     * been dirtied to the cache (replacing any existing versions of those pages) so
     * that future requests see up-to-date pages.
     *
     * @param tid     the transaction adding the tuple
     * @param tableId the table to add the tuple to
     * @param t       the tuple to add
     */
    public void insertTuple(TransactionId tid, int tableId, Tuple t)
            throws DbException, IOException, TransactionAbortedException {
        // TODO: some code goes here
        // not necessary for lab1
        DbFile dbFile = Database.getCatalog().getDatabaseFile(tableId);
        List<Page> pages = dbFile.insertTuple(tid, t);
        for (Page page : pages) {
            page.markDirty(true, tid);
            if (!bufferPool.containsKey(page.getId())) {
                if (bufferPool.size() >= numPages) {
                    evictPage();
                }
                bufferPool.put(page.getId(), page);
            }
        }        
    }

    /**
     * Remove the specified tuple from the buffer pool.
     * Will acquire a write lock on the page the tuple is removed from and any
     * other pages that are updated. May block if the lock(s) cannot be acquired.
     * <p>
     * Marks any pages that were dirtied by the operation as dirty by calling
     * their markDirty bit, and adds versions of any pages that have
     * been dirtied to the cache (replacing any existing versions of those pages) so
     * that future requests see up-to-date pages.
     *
     * @param tid the transaction deleting the tuple.
     * @param t   the tuple to delete
     */
    public void deleteTuple(TransactionId tid, Tuple t)
            throws DbException, IOException, TransactionAbortedException {
        // TODO: some code goes here
        // not necessary for lab1
        RecordId recordId = t.getRecordId();
        PageId pageId = recordId.getPageId();
        DbFile dbFile = Database.getCatalog().getDatabaseFile(pageId.getTableId());
        List<Page> pages = dbFile.deleteTuple(tid, t);
        for (Page page : pages) {
            page.markDirty(true, tid);
            if (!bufferPool.containsKey(page.getId())) {
                if (bufferPool.size() >= numPages) {
                    evictPage();
                }
                bufferPool.put(page.getId(), page);
            }
        }
    }

    /**
     * Flush all dirty pages to disk.
     * NB: Be careful using this routine -- it writes dirty data to disk so will
     * break simpledb if running in NO STEAL mode.
     */
    public synchronized void flushAllPages() throws IOException {
        // TODO: some code goes here
        // not necessary for lab1
        Set<PageId> pids = new HashSet<>(bufferPool.keySet());
        for (PageId pid : pids) {
            flushPage(pid);
        }
    }

    /**
     * Remove the specific page id from the buffer pool.
     * Needed by the recovery manager to ensure that the
     * buffer pool doesn't keep a rolled back page in its
     * cache.
     * <p>
     * Also used by B+ tree files to ensure that deleted pages
     * are removed from the cache so they can be reused safely
     */
    public synchronized void removePage(PageId pid) {
        // TODO: some code goes here
        // not necessary for lab1
        bufferPool.remove(pid);
    }

    /**
     * Flushes a certain page to disk
     *
     * @param pid an ID indicating the page to flush
     */
    private synchronized void flushPage(PageId pid) throws IOException {
        // TODO: some code goes here
        // not necessary for lab1
        Page page = bufferPool.get(pid);
        if (page == null) {
            throw new IOException("Page not found in buffer pool");
        }
        if (page.isDirty() != null) {
            Database.getCatalog().getDatabaseFile(pid.getTableId()).writePage(page);
            page.markDirty(false, null);
        }
    }

    /**
     * Write all pages of the specified transaction to disk.
     */
    public synchronized void flushPages(TransactionId tid) throws IOException {
        // TODO: some code goes here
        // not necessary for lab1|lab2
    }

    /**
     * Discards a page from the buffer pool.
     * Flushes the page to disk to ensure dirty pages are updated on disk.
     */
    private synchronized void evictPage() throws DbException {
        // TODO: some code goes here
        // not necessary for lab1
        if (!bufferPool.isEmpty()) {
            Iterator<PageId> iterator = bufferPool.keySet().iterator();
            if (iterator.hasNext()) {
                PageId pid = iterator.next();
                try {
                    flushPage(pid);
                } catch (IOException e) {
                    throw new DbException("Failed to flush page to disk");
                }
                bufferPool.remove(pid);
            }
        }
    }

}

这里将bufferPool的实现改为了LinkedHashMap,因此Exercise 3中实现的insertTuple和deleteTuple都进行了对应的修改,在插入和删除Tuple后将所在的页面标记为脏页,如果这一页不在bufferPool中时要将其换入。

  • evictPage方法负责刷新第一个页面(也就是最长未访问的页面)并驱逐,留出空间给要换入的页面。
  • getPage方法首先从bufferPool中取页,如果取到了就按照LRU的规则将其放到链表最后,如果没取到则读入bufferPool,并且要注意页数满的情况。
  • removePage方法不刷新page直接将其驱逐。
  • flushPage方法将脏页写回,但不会将其驱逐。
  • flushAllPages方法不使用ConcurrentHashMap之后bufferPool不再是线程安全的,所以为了迭代过程不受影响,先复制了一遍bufferPool.keySet()再进行遍历(这样做可以通过测试但是不知道会不会埋下雷)。

对Exercise 5执行系统测试:

ant runsystest -Dtest=EvictionTest

此时测试应该都是successful的。

A simple test

src/java/simpledb/目录下新建jointest.java文件:

package simpledb;

import java.io.*;

import simpledb.common.Database;
import simpledb.common.Type;
import simpledb.execution.Filter;
import simpledb.execution.Join;
import simpledb.execution.JoinPredicate;
import simpledb.execution.Predicate;
import simpledb.execution.SeqScan;
import simpledb.storage.HeapFile;
import simpledb.storage.IntField;
import simpledb.storage.Tuple;
import simpledb.storage.TupleDesc;
import simpledb.transaction.TransactionId;

public class jointest {

    public static void main(String[] argv) {
        // construct a 3-column table schema
        Type types[] = new Type[]{Type.INT_TYPE, Type.INT_TYPE, Type.INT_TYPE};
        String names[] = new String[]{"field0", "field1", "field2"};

        TupleDesc td = new TupleDesc(types, names);

        // create the tables, associate them with the data files
        // and tell the catalog about the schema  the tables.
        HeapFile table1 = new HeapFile(new File("some_data_file1.dat"), td);
        Database.getCatalog().addTable(table1, "t1");

        HeapFile table2 = new HeapFile(new File("some_data_file2.dat"), td);
        Database.getCatalog().addTable(table2, "t2");

        // construct the query: we use two SeqScans, which spoonfeed
        // tuples via iterators into join
        TransactionId tid = new TransactionId();

        SeqScan ss1 = new SeqScan(tid, table1.getId(), "t1");
        SeqScan ss2 = new SeqScan(tid, table2.getId(), "t2");

        // create a filter for the where condition
        Filter sf1 = new Filter(
                new Predicate(0,
                        Predicate.Op.GREATER_THAN, new IntField(1)), ss1);

        JoinPredicate p = new JoinPredicate(1, Predicate.Op.EQUALS, 1);
        Join j = new Join(p, sf1, ss2);

        // and run it
        try {
            j.open();
            while (j.hasNext()) {
                Tuple tup = j.next();
                System.out.println(tup);
            }
            j.close();
            Database.getBufferPool().transactionComplete(tid);

        } catch (Exception e) {
            e.printStackTrace();
        }

    }

}

这段程序相当于SQL语句:

SELECT *
FROM some_data_file1,
     some_data_file2
WHERE some_data_file1.field1 = some_data_file2.field1
  AND some_data_file1.id > 1

在项目顶层目录下准备两个数据文件some_data_file1.txtsome_data_file2.txt,内容为3列整数即可,例如:

1,10,100
2,20,200
3,30,300
4,40,400
5,50,500
5,50,600

先使用ant打包程序:

ant

如果没有报错,就将测试数据文件转换为数据库查询的二进制文件:

java -jar dist/simpledb.jar convert some_data_file1.txt 3
java -jar dist/simpledb.jar convert some_data_file2.txt 3

此时会生成两个dat文件,然后调用刚刚写的测试程序来验证:

java -classpath dist/simpledb.jar simpledb.jointest

Parser

实验代码已经提供了一个simpleDB的查询解析器,假设有一个测试文件data.txt,内容为:

1,10
2,20
3,30
4,40
5,50
5,50

先使用ant打包程序,然后将测试文件转换为simpleDB表:

java -jar dist/simpledb.jar convert data.txt 2 "int,int"

在创建目录文件catalog.txt,内容为:

data (f1 int, f2 int)

最后调用解析器:

java -jar dist/simpledb.jar parser catalog.txt
posted on 2024-04-22 16:50  未连接到互联网  阅读(50)  评论(0编辑  收藏  举报