From 13f96ee1e36a36505e621082e27e9abc3e8f4739 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Fri, 13 May 2016 16:03:00 -0700 Subject: [PATCH 1/6] fix doc layout --- R/pkg/R/stats.R | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index e92b9e3d84f1..d95d6cda166f 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -19,9 +19,7 @@ setOldClass("jobj") -#' crosstab -#' -#' Computes a pair-wise frequency table of the given columns. Also known as a contingency +#' crosstab - Computes a pair-wise frequency table of the given columns. Also known as a contingency #' table. The number of distinct values for each column should be less than 1e4. At most 1e6 #' non-zero pair frequencies will be returned. #' @@ -32,6 +30,7 @@ setOldClass("jobj") #' of `col2`. The name of the first column will be `$col1_$col2`. Pairs that have no #' occurrences will have zero as their counts. #' +#' @title SparkDataFrame statistic functions #' @rdname statfunctions #' @name crosstab #' @export @@ -49,9 +48,7 @@ setMethod("crosstab", collect(dataFrame(sct)) }) -#' cov -#' -#' Calculate the sample covariance of two numerical columns of a SparkDataFrame. +#' cov - Calculate the sample covariance of two numerical columns of a SparkDataFrame. #' #' @param x A SparkDataFrame #' @param col1 the name of the first column @@ -75,9 +72,7 @@ setMethod("cov", callJMethod(statFunctions, "cov", col1, col2) }) -#' corr -#' -#' Calculates the correlation of two columns of a SparkDataFrame. +#' corr - Calculates the correlation of two columns of a SparkDataFrame. #' Currently only supports the Pearson Correlation Coefficient. #' For Spearman Correlation, consider using RDD methods found in MLlib's Statistics. #' @@ -106,9 +101,7 @@ setMethod("corr", callJMethod(statFunctions, "corr", col1, col2, method) }) -#' freqItems -#' -#' Finding frequent items for columns, possibly with false positives. +#' freqItems - Finding frequent items for columns, possibly with false positives. #' Using the frequent element count algorithm described in #' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou. #' @@ -134,9 +127,7 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), collect(dataFrame(sct)) }) -#' approxQuantile -#' -#' Calculates the approximate quantiles of a numerical column of a SparkDataFrame. +#' approxQuantile - Calculates the approximate quantiles of a numerical column of a SparkDataFrame. #' #' The result of this algorithm has the following deterministic bound: #' If the SparkDataFrame has N elements and if we request the quantile at probability `p` up to @@ -174,9 +165,8 @@ setMethod("approxQuantile", as.list(probabilities), relativeError) }) -#' sampleBy -#' -#' Returns a stratified sample without replacement based on the fraction given on each stratum. +#' sampleBy - Returns a stratified sample without replacement based on the fraction given on each +#' stratum. #' #' @param x A SparkDataFrame #' @param col column that defines strata From df0851ca8694b0c8ffdab557d3ee17beefa25a2e Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Mon, 20 Jun 2016 19:11:28 -0700 Subject: [PATCH 2/6] update as per feedback --- R/pkg/R/stats.R | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index d95d6cda166f..41d42f8df84c 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -19,7 +19,10 @@ setOldClass("jobj") -#' crosstab - Computes a pair-wise frequency table of the given columns. Also known as a contingency +#' @title SparkDataFrame statistic functions +#' crosstab - Computes a pair-wise frequency table of the given columns +#' +#' Computes a pair-wise frequency table of the given columns. Also known as a contingency #' table. The number of distinct values for each column should be less than 1e4. At most 1e6 #' non-zero pair frequencies will be returned. #' @@ -30,7 +33,6 @@ setOldClass("jobj") #' of `col2`. The name of the first column will be `$col1_$col2`. Pairs that have no #' occurrences will have zero as their counts. #' -#' @title SparkDataFrame statistic functions #' @rdname statfunctions #' @name crosstab #' @export @@ -48,14 +50,14 @@ setMethod("crosstab", collect(dataFrame(sct)) }) -#' cov - Calculate the sample covariance of two numerical columns of a SparkDataFrame. +#' Calculate the sample covariance of two numerical columns of a SparkDataFrame. #' #' @param x A SparkDataFrame #' @param col1 the name of the first column #' @param col2 the name of the second column #' @return the covariance of the two columns. #' -#' @rdname statfunctions +#' @rdname cov #' @name cov #' @export #' @examples @@ -72,7 +74,7 @@ setMethod("cov", callJMethod(statFunctions, "cov", col1, col2) }) -#' corr - Calculates the correlation of two columns of a SparkDataFrame. +#' Calculates the correlation of two columns of a SparkDataFrame. #' Currently only supports the Pearson Correlation Coefficient. #' For Spearman Correlation, consider using RDD methods found in MLlib's Statistics. #' @@ -83,7 +85,7 @@ setMethod("cov", #' only "pearson" is allowed now. #' @return The Pearson Correlation Coefficient as a Double. #' -#' @rdname statfunctions +#' @rdname corr #' @name corr #' @export #' @examples From 0ac89db706a0137e957aa2fac75082507613c8bf Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Mon, 20 Jun 2016 21:57:21 -0700 Subject: [PATCH 3/6] fix bug --- R/pkg/R/generics.R | 8 ++++---- R/pkg/R/stats.R | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index f6b9276d86f3..a2ec647ebb5b 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -430,19 +430,19 @@ setGeneric("coltypes<-", function(x, value) { standardGeneric("coltypes<-") }) #' @export setGeneric("columns", function(x) {standardGeneric("columns") }) -#' @rdname statfunctions +#' @rdname cov #' @export setGeneric("cov", function(x, ...) {standardGeneric("cov") }) -#' @rdname statfunctions +#' @rdname corr #' @export setGeneric("corr", function(x, ...) {standardGeneric("corr") }) -#' @rdname statfunctions +#' @rdname cov #' @export setGeneric("covar_samp", function(col1, col2) {standardGeneric("covar_samp") }) -#' @rdname statfunctions +#' @rdname cov #' @export setGeneric("covar_pop", function(col1, col2) {standardGeneric("covar_pop") }) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 41d42f8df84c..92a45fea943f 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -20,6 +20,8 @@ setOldClass("jobj") #' @title SparkDataFrame statistic functions + +#' @description #' crosstab - Computes a pair-wise frequency table of the given columns #' #' Computes a pair-wise frequency table of the given columns. Also known as a contingency From 9104bbc2cdf0927aa796b6817cde59c728215557 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Mon, 20 Jun 2016 22:01:14 -0700 Subject: [PATCH 4/6] more fix --- R/pkg/R/stats.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 92a45fea943f..d590a3a03172 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -22,9 +22,7 @@ setOldClass("jobj") #' @title SparkDataFrame statistic functions #' @description -#' crosstab - Computes a pair-wise frequency table of the given columns -#' -#' Computes a pair-wise frequency table of the given columns. Also known as a contingency +#' crosstab - Computes a pair-wise frequency table of the given columns. Also known as a contingency #' table. The number of distinct values for each column should be less than 1e4. At most 1e6 #' non-zero pair frequencies will be returned. #' From 10a4dba683e06254b038ec48dbc700c8abcd9ce7 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Mon, 20 Jun 2016 22:35:13 -0700 Subject: [PATCH 5/6] fix one missed --- R/pkg/R/generics.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index a2ec647ebb5b..fac8470b5a1e 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -442,7 +442,7 @@ setGeneric("corr", function(x, ...) {standardGeneric("corr") }) #' @export setGeneric("covar_samp", function(col1, col2) {standardGeneric("covar_samp") }) -#' @rdname cov +#' @rdname covar_pop #' @export setGeneric("covar_pop", function(col1, col2) {standardGeneric("covar_pop") }) From 3760d03289ef42aa60fc7d92c49cb32674bb0edb Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Mon, 20 Jun 2016 23:18:23 -0700 Subject: [PATCH 6/6] more bug fix --- R/pkg/R/stats.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index d590a3a03172..e40b1773d70e 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -103,6 +103,7 @@ setMethod("corr", callJMethod(statFunctions, "corr", col1, col2, method) }) +#' @description #' freqItems - Finding frequent items for columns, possibly with false positives. #' Using the frequent element count algorithm described in #' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou. @@ -129,8 +130,8 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), collect(dataFrame(sct)) }) +#' @description #' approxQuantile - Calculates the approximate quantiles of a numerical column of a SparkDataFrame. -#' #' The result of this algorithm has the following deterministic bound: #' If the SparkDataFrame has N elements and if we request the quantile at probability `p` up to #' error `err`, then the algorithm will return a sample `x` from the SparkDataFrame so that the @@ -167,6 +168,7 @@ setMethod("approxQuantile", as.list(probabilities), relativeError) }) +#' @description #' sampleBy - Returns a stratified sample without replacement based on the fraction given on each #' stratum. #'