flink-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (FLINK-3919) Distributed Linear Algebra: row-based matrix
Date Wed, 01 Jun 2016 13:07:59 GMT

    [ https://issues.apache.org/jira/browse/FLINK-3919?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15310265#comment-15310265
] 

ASF GitHub Bot commented on FLINK-3919:
---------------------------------------

Github user chobeat commented on a diff in the pull request:

    https://github.com/apache/flink/pull/1996#discussion_r65356053
  
    --- Diff: flink-libraries/flink-ml/src/main/scala/org/apache/flink/ml/math/distributed/DistributedRowMatrix.scala
---
    @@ -0,0 +1,166 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.flink.ml.math.distributed
    +
    +import breeze.linalg.{CSCMatrix => BreezeSparseMatrix, Matrix => BreezeMatrix,
Vector => BreezeVector}
    +import org.apache.flink.api.scala._
    +import org.apache.flink.ml.math.Breeze._
    +import org.apache.flink.ml.math.{Matrix => FlinkMatrix, _}
    +
    +/**
    +  * Distributed row-major matrix representation.
    +  * @param data Rows of the matrix; each [[IndexedRow]] carries its row index and its values.
    +  * @param numRows Number of rows.
    +  * @param numCols Number of columns.
    +  */
    +class DistributedRowMatrix(val data: DataSet[IndexedRow],
    +                           val numRows: Int,
    +                           val numCols: Int )
    +    extends DistributedMatrix {
    +
    +  /**
    +    * Collects the data in the form of a sequence of coordinates associated with
    +    * their values. The whole data set is gathered locally through `collect()`.
    +    * @return One (row index, column index, value) triple per entry yielded by the row vectors.
    +    */
    +  def toCOO: Seq[(Int, Int, Double)] = {
    +
    +    val localRows = data.collect()
    +
    +    for (IndexedRow(rowIndex, vector) <- localRows;
    +         (columnIndex, value) <- vector) yield (rowIndex, columnIndex, value)
    +  }
    +
    +  /**
    +    * Collects the data in the form of a SparseMatrix built from the output of [[toCOO]].
    +    * @return A local sparse matrix with `numRows` rows and `numCols` columns.
    +    */
    +  def toLocalSparseMatrix: SparseMatrix = {
    +    val localMatrix =
    +      SparseMatrix.fromCOO(this.numRows, this.numCols, this.toCOO)
    +    // Sanity checks: the local matrix must keep the declared dimensions.
    +    require(localMatrix.numRows == this.numRows,
    +            s"Local matrix has ${localMatrix.numRows} rows, expected ${this.numRows}.")
    +    require(localMatrix.numCols == this.numCols,
    +            s"Local matrix has ${localMatrix.numCols} columns, expected ${this.numCols}.")
    +    localMatrix
    +  }
    +
    +  //TODO: convert to dense representation on the distributed matrix and collect it afterward
    +  /**
    +    * Collects the data in the form of a DenseMatrix, going through [[toLocalSparseMatrix]].
    +    * @return The local dense matrix.
    +    */
    +  def toLocalDenseMatrix: DenseMatrix = this.toLocalSparseMatrix.toDenseMatrix
    +
    +  /**
    +    * Applies a higher-order function to each pair of rows that share the same row index.
    +    * The two data sets are combined with a full outer join on `rowIndex`; when a row index
    +    * is present on one side only, the missing row is replaced by a sparse placeholder of the
    +    * present row's size holding a single explicit 0.0 at index 0.
    +    * @param fun Function combining a row of this matrix (first argument) with the
    +    *            corresponding row of `other` (second argument).
    +    * @param other Matrix whose rows are paired with the rows of this matrix.
    +    *              It must have the same number of rows and columns as this matrix.
    +    * @return A new matrix with the same declared dimensions, holding the combined rows.
    +    */
    +  def byRowOperation(fun: (Vector, Vector) => Vector,
    +                     other: DistributedRowMatrix): DistributedRowMatrix = {
    +    val otherData = other.data
    +    require(this.numCols == other.numCols,
    +            s"Column count mismatch: ${this.numCols} != ${other.numCols}.")
    +    require(this.numRows == other.numRows,
    +            s"Row count mismatch: ${this.numRows} != ${other.numRows}.")
    +
    +    val result = this.data
    +      .fullOuterJoin(otherData)
    +      .where("rowIndex")
    +      .equalTo("rowIndex")(
    +          (left: IndexedRow, right: IndexedRow) => {
    +            // An outer join hands over null for the side that has no matching row,
    +            // hence the Option wrappers. The fallbacks are evaluated lazily, so the
    +            // other side is only dereferenced when this side is actually missing.
    +            val row1 = Option(left).getOrElse(
    +                IndexedRow(
    +                    right.rowIndex,
    +                    SparseVector.fromCOO(right.values.size, List((0, 0.0)))))
    +            val row2 = Option(right).getOrElse(
    +                IndexedRow(
    +                    left.rowIndex,
    +                    SparseVector.fromCOO(left.values.size, List((0, 0.0)))))
    +            IndexedRow(row1.rowIndex, fun(row1.values, row2.values))
    +          }
    +      )
    +    new DistributedRowMatrix(result, numRows, numCols)
    +  }
    +
    +  /**
    +    * Adds the matrix to another matrix, row by row, through [[byRowOperation]].
    +    * @param other Matrix to add; it must have the same dimensions as this matrix.
    +    * @return A new matrix whose rows are the sums of the paired rows.
    +    */
    +  def sum(other: DistributedRowMatrix): DistributedRowMatrix = {
    +    val sumFunction: (Vector, Vector) => Vector = (x: Vector, y: Vector) =>
    +      (x.asBreeze + y.asBreeze).fromBreeze
    +    this.byRowOperation(sumFunction, other)
    +  }
    +
    +  /**
    +    * Subtracts another matrix, row by row, through [[byRowOperation]].
    +    * @param other Matrix to subtract; it must have the same dimensions as this matrix.
    +    * @return A new matrix whose rows are the rows of this matrix minus the paired
    +    *         rows of `other`.
    +    */
    +  def subtract(other: DistributedRowMatrix): DistributedRowMatrix = {
    +    val subFunction: (Vector, Vector) => Vector = (x: Vector, y: Vector) =>
    +      (x.asBreeze - y.asBreeze).fromBreeze
    +    this.byRowOperation(subFunction, other)
    +  }
    +}
    +
    +object DistributedRowMatrix {
    +
    +  type MatrixRowIndex = Int
    --- End diff --
    
    yup, I tried to stay general, but I'm not sure I've been disciplined about using MatrixRowIndex
instead of Int, so maybe I should remove it and leave the generalization to a later refactoring.


> Distributed Linear Algebra: row-based matrix
> --------------------------------------------
>
>                 Key: FLINK-3919
>                 URL: https://issues.apache.org/jira/browse/FLINK-3919
>             Project: Flink
>          Issue Type: New Feature
>          Components: Machine Learning Library
>            Reporter: Simone Robutti
>            Assignee: Simone Robutti
>
> Distributed matrix implementation as a DataSet of IndexedRow and related operations



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Mime
View raw message