@@ -7,11 +7,19 @@ from io import StringIO
77
88from libc.string cimport strchr
99
10+ import cython
11+
12+ from cpython cimport PyObject_Str, PyUnicode_Join
13+
1014from cpython.datetime cimport datetime, datetime_new, import_datetime
1115from cpython.version cimport PY_VERSION_HEX
1216import_datetime()
1317
1418import numpy as np
19+ cimport numpy as cnp
20+ from numpy cimport (PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT,
21+ PyArray_IterNew, flatiter, float64_t)
22+ cnp.import_array()
1523
1624# dateutil compat
1725from dateutil.tz import (tzoffset,
@@ -26,11 +34,16 @@ from pandas._config import get_option
2634
2735from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
2836from pandas._libs.tslibs.nattype import nat_strings, NaT
29- from pandas._libs.tslibs.util cimport get_c_string_buf_and_size
37+ from pandas._libs.tslibs.util cimport is_array, get_c_string_buf_and_size
3038
3139cdef extern from " ../src/headers/portable.h" :
3240 int getdigit_ascii(char c, int default) nogil
3341
42+ cdef extern from " ../src/parser/tokenizer.h" :
43+ double xstrtod(const char * p, char ** q, char decimal, char sci, char tsep,
44+ int skip_trailing, int * error, int * maybe_int)
45+
46+
3447# ----------------------------------------------------------------------
3548# Constants
3649
@@ -302,20 +315,48 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False,
302315 return parsed, parsed, reso
303316
304317
cpdef bint _does_string_look_like_datetime(object py_string):
    """
    Decide whether a string should be treated as date-like rather than as
    a plain number.

    The string is rejected only when it is listed in
    `_not_datelike_strings`, or when it parses completely as a float that
    is smaller than 1000. A string whose reported length is below 1, a
    string starting with '0', and a string that is not entirely numeric
    are all accepted.

    Parameters
    ----------
    py_string : object
        The string to inspect.

    Returns
    -------
    bool
        True if the string may be a datetime, False otherwise.
    """
    cdef:
        const char *c_str
        char *parse_end = NULL
        Py_ssize_t n_chars = -1
        double as_number
        char lead
        int parse_error = 0

    c_str = get_c_string_buf_and_size(py_string, &n_chars)
    if n_chars < 1:
        return True

    lead = c_str[0]
    if lead == b'0':
        # A leading zero is more consistent with a date-like string
        # than with a number.
        return True

    if py_string in _not_datelike_strings:
        return False

    # The arguments make xstrtod mimic Python's `float` cast (which, for
    # example, accepts " 35.e-1 "): b'.' is the decimal separator, b'e'
    # allows the exponential form, b'\0' disables the thousands separator
    # and 1 skips the surrounding whitespace.
    as_number = xstrtod(c_str, &parse_end,
                        b'.', b'e', b'\0', 1, &parse_error, NULL)

    # Unless the whole buffer was consumed without an error, the string is
    # not a plain number, so it may still be a date.
    if parse_error != 0 or parse_end != c_str + n_chars:
        return True

    return as_number >= 1000
321362
@@ -857,3 +898,117 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse,
857898 return guessed_format
858899 else :
859900 return None
901+
902+
@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline object convert_to_unicode(object item,
                                      bint keep_trivial_numbers):
    """
    Convert `item` to str.

    Parameters
    ----------
    item : object
    keep_trivial_numbers : bool
        if True, an integer zero, a float zero or a float NaN is returned
        unchanged instead of being converted to a string

    Returns
    -------
    str or int or float
    """
    cdef:
        float64_t float_item

    if keep_trivial_numbers:
        if isinstance(item, int):
            # Compare as a Python object: a C cast (`<int>item`) raises
            # OverflowError for integers that do not fit into a C int.
            if item == 0:
                return item
        elif isinstance(item, float):
            float_item = item
            # `x != x` is true only for NaN
            if float_item == 0.0 or float_item != float_item:
                return item

    if not isinstance(item, str):
        item = PyObject_Str(item)

    return item
937+
938+
@cython.wraparound(False)
@cython.boundscheck(False)
def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True):
    """
    Build one string per row by joining, with a single space, the elements
    that the arrays in `date_cols` hold at that row.

    Parameters
    ----------
    date_cols : tuple of numpy arrays
    keep_trivial_numbers : bool, default True
        only used when len(date_cols) == 1: if True, integer/float zeros
        are not converted to strings

    Returns
    -------
    arr_of_rows : ndarray (dtype=object)
        as long as the shortest array in `date_cols`; empty when
        `date_cols` is empty

    Raises
    ------
    ValueError
        if any element of `date_cols` is not a numpy array

    Examples
    --------
    >>> dates=np.array(['3/31/2019', '4/30/2019'], dtype=object)
    >>> times=np.array(['11:20', '10:45'], dtype=object)
    >>> result = _concat_date_cols((dates, times))
    >>> result
    array(['3/31/2019 11:20', '4/30/2019 10:45'], dtype=object)
    """
    cdef:
        Py_ssize_t n_rows = 0, n_cols = len(date_cols)
        Py_ssize_t j, i
        list row_parts
        cnp.ndarray[object] iter_store
        object[::1] iter_store_view
        flatiter cursor
        cnp.ndarray[object] out
        object[:] out_view

    if n_cols == 0:
        return np.zeros(0, dtype=object)

    for column in date_cols:
        if not is_array(column):
            raise ValueError(
                "not all elements from date_cols are numpy arrays")

    n_rows = min(len(column) for column in date_cols)
    out = np.zeros(n_rows, dtype=object)
    out_view = out

    if n_cols == 1:
        # single column: every element is converted on its own
        column = date_cols[0]
        cursor = <flatiter>PyArray_IterNew(column)
        for i in range(n_rows):
            value = PyArray_GETITEM(column, PyArray_ITER_DATA(cursor))
            out_view[i] = convert_to_unicode(value, keep_trivial_numbers)
            PyArray_ITER_NEXT(cursor)
        return out

    # fixed-size list that is reused for every row
    row_parts = [None] * n_cols

    # one flat iterator per input array, kept in an object ndarray and
    # accessed through a memoryview for cheaper indexing
    iter_store = np.zeros(n_cols, dtype=object)
    iter_store_view = iter_store
    for j, column in enumerate(date_cols):
        iter_store_view[j] = PyArray_IterNew(column)

    # elements that share a row are merged into one string
    for i in range(n_rows):
        for j, column in enumerate(date_cols):
            # the iterators are stored as plain objects, so cast back
            cursor = <flatiter>iter_store_view[j]
            value = PyArray_GETITEM(column, PyArray_ITER_DATA(cursor))
            row_parts[j] = convert_to_unicode(value, False)
            PyArray_ITER_NEXT(cursor)
        out_view[i] = PyUnicode_Join(' ', row_parts)

    return out
0 commit comments