diff --git a/doc/source/basic_usage.rst b/doc/source/basic_usage.rst
index 3bd5eba..9a201c7 100644
--- a/doc/source/basic_usage.rst
+++ b/doc/source/basic_usage.rst
@@ -1,6 +1,34 @@
 How To Use
 ============
 
+Introduction
+---------------
+
+The API for wrf-python can be summarized as a variable extraction/computation
+routine, several interpolation routines, and some plotting utilities.
+The API is kept as simple as possible to help minimize the programming burden
+on new users, students, and scientists. In the future, we plan to extend
+xarray for programmers desiring a more object-oriented API, but this remains
+a work in progress.
+
+The five most commonly used routines can be summarized as:
+
+- **wrf.getvar**: The routine that extracts WRF NetCDF variables or
+  computes diagnostic variables. This is the routine you will use most often.
+
+- **wrf.interplevel**: Interpolates a three-dimensional field to a horizontal
+  plane at a specified level using simple (fast) linear interpolation.
+
+- **wrf.vertcross**: Interpolates a three-dimensional field to a vertical
+  plane through a user-specified horizontal line (i.e. a cross section).
+
+- **wrf.interpline**: Interpolates a two-dimensional field to a user-specified
+  line.
+
+- **wrf.vinterp**: Interpolates a three-dimensional field to user-specified
+  'surface' levels (e.g. theta-e levels). This is a smarter, but slower,
+  version of wrf.interplevel.
+
 Basic Usage
 ----------------
 
@@ -1614,3 +1642,289 @@ Result:
 
     ]
 
+
+Using OpenMP
+-------------------------
+
+Beginning in version 1.1, the Fortran computational routines in wrf-python
+make use of OpenMP directives. OpenMP enables the calculations to use multiple
+CPU cores, which can improve performance. In order to use the OpenMP features,
+wrf-python has to be compiled with OpenMP enabled (most pre-built binary
+installations will have this enabled).
+
+The Fortran computational routines have all been built to use runtime
+scheduling, instead of compile-time scheduling, so that the user can choose
+the scheduler type from within their Python application. By default, the
+scheduling type is set to :data:`wrf.OMP_SCHED_STATIC` using only one CPU
+core, so wrf-python will behave similarly to versions built without OpenMP.
+For the most part, the differences between the scheduling types are minimal,
+the exception being the :data:`wrf.OMP_SCHED_DYNAMIC` scheduler, which is
+noticeably slower due to the additional overhead associated with it. For new
+users, the default scheduler should be sufficient.
+
+
+Verifying that OpenMP is Enabled
+*************************************
+
+To take advantage of the performance improvements offered by OpenMP,
+wrf-python needs to have been compiled with OpenMP features enabled. The
+example below shows how you can determine if OpenMP is enabled in your build
+of wrf-python.
+
+.. code-block:: python
+
+    from __future__ import print_function
+
+    from wrf import omp_enabled
+
+    print(omp_enabled())
+
+
+Result:
+
+.. code-block:: none
+
+    True
+
+
+Determining the Number of Available Processors
+***************************************************
+
+The example below shows how you can get the maximum number of processors
+that are available on your system.
+
+.. code-block:: python
+
+    from __future__ import print_function
+
+    from wrf import omp_get_num_procs
+
+    print(omp_get_num_procs())
+
+
+Result:
+
+.. code-block:: none
+
+    8
+
+
+Specifying the Number of Threads
+*************************************
+
+To enable multicore support via OpenMP, specifying the maximum number
+of OpenMP threads (i.e. CPU cores) is the only step that you need to take.
+
+In the example below, :meth:`wrf.omp_set_num_threads` is used to set the
+maximum number of threads to use, and :meth:`wrf.omp_get_max_threads` is
+used to get (and print) the maximum number of threads currently set.
+
+.. note::
+
+    Although there is an OpenMP routine named :meth:`wrf.omp_get_num_threads`,
+    it always returns 1 when called from the sequential part of the program.
+    Use :meth:`wrf.omp_get_max_threads` to return the value set by
+    :meth:`wrf.omp_set_num_threads`.
+
+.. code-block:: python
+
+    from __future__ import print_function
+
+    from wrf import omp_set_num_threads, omp_get_max_threads
+
+    omp_set_num_threads(4)
+
+    print(omp_get_max_threads())
+
+
+Result:
+
+.. code-block:: none
+
+    4
+
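+If you simply want to hand most (or all) of the processors reported by
+:meth:`wrf.omp_get_num_procs` to OpenMP, the routines above can be combined.
+The sketch below is just one way to do this; leaving one processor free is an
+illustrative choice, not a requirement.
+
+.. code-block:: python
+
+    from __future__ import print_function
+
+    from wrf import (omp_set_num_threads, omp_get_num_procs,
+                     omp_get_max_threads)
+
+    # Use all but one of the available processors (an illustrative choice)
+    omp_set_num_threads(max(1, omp_get_num_procs() - 1))
+
+    print(omp_get_max_threads())
+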
+
+Setting a Different Scheduler Type
+**************************************
+
+When an OpenMP directive is encountered in the Fortran code, a scheduler is
+used to determine how the work is divided among the threads. All of the
+Fortran routines are compiled to use a 'runtime' scheduler, which indicates
+that the scheduler type (from the four listed below) is to be chosen at
+runtime (i.e. inside a Python script).
+
+By default, the scheduler chosen is the :data:`wrf.OMP_SCHED_STATIC`
+scheduler, which should be sufficient for most users. However, OpenMP and
+wrf-python include the following options for the scheduler type:
+
+- :data:`wrf.OMP_SCHED_STATIC`
+- :data:`wrf.OMP_SCHED_DYNAMIC`
+- :data:`wrf.OMP_SCHED_GUIDED`
+- :data:`wrf.OMP_SCHED_AUTO`
+
+Refer to the
+`OpenMP Specification `_
+for more information about these scheduler types. In local testing,
+:data:`wrf.OMP_SCHED_GUIDED` produced the best results, but
+differences between :data:`wrf.OMP_SCHED_STATIC`,
+:data:`wrf.OMP_SCHED_GUIDED`, and
+:data:`wrf.OMP_SCHED_AUTO` were minor. However,
+:data:`wrf.OMP_SCHED_DYNAMIC` produced noticeably slower results
+due to the overhead of using a dynamic scheduler.
+
+When setting a scheduler type, :meth:`wrf.omp_set_schedule` takes two
+arguments. The first is the scheduler type (one from the list above), and the
+second optional argument is a modifier, which is usually referred to as the
+chunk size. If the modifier/chunk_size is set to 0, then the OpenMP
+implementation's default value is used. For :data:`wrf.OMP_SCHED_AUTO`, the
+modifier is ignored.
+
+If you are new to OpenMP and all of this sounds confusing, don't worry about
+setting a scheduler type. The default static scheduler will be good enough.
+
+In the example below, the scheduler type is set to
+:data:`wrf.OMP_SCHED_GUIDED` using the default chunk size of 0. The
+scheduler type is then read back using :meth:`wrf.omp_get_schedule`
+and printed.
+
+.. code-block:: python
+
+    from __future__ import print_function
+
+    from wrf import omp_set_schedule, omp_get_schedule, OMP_SCHED_GUIDED
+
+    omp_set_schedule(OMP_SCHED_GUIDED, 0)
+
+    sched, modifier = omp_get_schedule()
+
+    print(sched, modifier)
+
+
+Result:
+
+.. code-block:: none
+
+    3 1
+
+Notice that the printed scheduler type (the *sched* variable) is 3, which is
+the integer constant value for the :data:`wrf.OMP_SCHED_GUIDED` scheduler
+type. The *modifier* is returned as 1, which is different from the 0 that was
+supplied to :meth:`wrf.omp_set_schedule`. This is because the 0 tells OpenMP
+to use its own default value for the scheduler, which is 1 for this type of
+scheduler.
+
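+If you would like to see what different thread counts (or scheduler types) do
+for your own data, :meth:`wrf.omp_get_wtime` can be used to measure the
+elapsed wall clock time of a computation. The sketch below assumes a WRF
+output file named "wrfout_d01_2016-10-07_00_00_00" (substitute your own file)
+and simply times :meth:`wrf.getvar` with one thread and then with four; treat
+it as an illustration rather than a rigorous benchmark.
+
+.. code-block:: python
+
+    from __future__ import print_function
+
+    from netCDF4 import Dataset
+    from wrf import getvar, omp_set_num_threads, omp_get_wtime
+
+    ncfile = Dataset("wrfout_d01_2016-10-07_00_00_00")
+
+    for nthreads in (1, 4):
+        omp_set_num_threads(nthreads)
+
+        start = omp_get_wtime()
+        slp = getvar(ncfile, "slp")
+        end = omp_get_wtime()
+
+        # Note: wrf-python keeps a small internal cache of extracted
+        # variables, so repeated calls for the same file/time may be faster
+        # regardless of the thread count.  Treat these timings as indicative.
+        print("{} thread(s): {:.3f} seconds".format(nthreads, end - start))
+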
+
+Performance Note
+******************
+
+If you have enabled multicore support with OpenMP, you may have noticed that
+the routines do not scale linearly with the number of CPU cores added. One
+main reason is that the computational routines are already fairly efficient
+and vectorize well, so for many grid sizes, the time it takes to extract the
+variables is on par with the time required to compute the diagnostic with a
+single CPU core. Adding more CPU cores will decrease the time needed to do
+the computation, but total performance will still be limited by the time it
+takes to extract the variables from the NetCDF file. In local testing,
+diminishing returns were seen after four CPU cores, but this will largely
+depend on your hardware and the grid size of your WRF run.
+
+
+Performance Tips
+--------------------
+
+Memory Issues and :data:`wrf.ALL_TIMES`
+******************************************
+
+Using :data:`wrf.ALL_TIMES` for the *timeidx* argument to :meth:`wrf.getvar`
+is convenient for computing diagnostic variables across multiple files/times,
+but there is something that users should be aware of. When
+:data:`wrf.ALL_TIMES` is set as the *timeidx* argument, all arrays used in the
+computation are extracted for all times before the computation is started.
+This can cause serious memory issues on smaller hardware systems like
+laptops.
+
+In this example, the user wants to compute z for an entire data set that is
+289 x 39 x 300 x 300 (289 times, 39 vertical levels, and a 300 x 300 grid).
+The user is using a laptop with 16 GB of memory.
+
+.. code:: python
+
+    from netCDF4 import Dataset
+    from wrf import getvar, ALL_TIMES
+
+    file_list = [Dataset("/path/to/file1"), Dataset("/path/to/file2"),...]
+    z = getvar(file_list, "z", ALL_TIMES)
+
+Five hours later, the computation finished. What happened?
+
+In wrf-python, all of the computational routines use 8-byte REAL variables so
+that both the 4-byte and 8-byte versions of WRF output can be used. The
+calculation for z extracts three variables (P, PHB, and HGT) and returns a
+fourth array (RESULT). The RESULT array is cut down to 4-byte REALs after the
+computation, but it must be stored as 8-byte REALs while the computation is
+being performed.
+
+Let's look at the approximate amount of memory needed:
+
+.. code-block:: none
+
+    P:      289 x 39 x 300 x 300 x 8 = 8,115,120,000 bytes (~8 GB!)
+    PHB:    289 x 39 x 300 x 300 x 8 = 8,115,120,000 bytes (~8 GB!)
+    HGT:    289 x 300 x 300 x 8      = 208,080,000 bytes   (~208 MB)
+    RESULT: 289 x 39 x 300 x 300 x 8 = 8,115,120,000 bytes (~8 GB!)
+
+Yikes! In order to do this calculation using :data:`wrf.ALL_TIMES` as the
+*timeidx*, over 24 GB of memory are needed for this one calculation. When the
+laptop runs out of memory, it begins using the hard drive for swap memory,
+which runs hundreds of times slower than real memory.
+
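+If you would like a rough estimate of the memory involved before committing
+to :data:`wrf.ALL_TIMES`, the arithmetic above is easy to reproduce. The
+short sketch below simply repeats the calculation for the example shape used
+here; it does not read anything from your files.
+
+.. code-block:: python
+
+    from __future__ import print_function
+
+    ntimes, nz, ny, nx = 289, 39, 300, 300   # the example shape from above
+
+    # P, PHB, and the intermediate RESULT are all 8-byte REALs during the
+    # computation; HGT is a 2D field for each time.
+    bytes_3d = ntimes * nz * ny * nx * 8
+    bytes_2d = ntimes * ny * nx * 8
+
+    total = 3 * bytes_3d + bytes_2d
+    print("approximate peak memory: {:.1f} GB".format(total / 1.0e9))
+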
+To fix this situation, it is better to allocate the output array yourself and
+run the calculation for each time step in a loop. The memory requirements
+then change to:
+
+.. code-block:: none
+
+    (only the final result needs to be stored, as 4-byte REALs)
+
+    FINAL_RESULT: 289 x 39 x 300 x 300 x 4 = 4,057,560,000 bytes (~4 GB)
+
+    (the numbers below are for each loop iteration)
+
+    P:      39 x 300 x 300 x 8 = 28,080,000 bytes (~28 MB)
+    PHB:    39 x 300 x 300 x 8 = 28,080,000 bytes (~28 MB)
+    HGT:    300 x 300 x 8      = 720,000 bytes    (~720 KB)
+    RESULT: 39 x 300 x 300 x 8 = 28,080,000 bytes (~28 MB)
+
+Since the memory used for the computation is freed after each loop iteration,
+the total memory usage drops to approximately 4.1 GB.
+
+The moral of the story is that you need to make sure that your system has
+enough memory to extract everything it needs up front if you want to use
+:data:`wrf.ALL_TIMES`; otherwise, it is better to "loop-and-fill" yourself.
+
+Here is an example of the loop-and-fill technique:
+
+.. code:: python
+
+    from __future__ import print_function, division
+
+    import numpy as np
+    from netCDF4 import Dataset
+    from wrf import getvar
+
+    filename_list = ["/path/to/file1", "/path/to/file2",...]
+
+    # Result shape (hardcoded for this example)
+    result_shape = (289, 39, 300, 300)
+
+    # Only need 4-byte floats for the final result
+    z_final = np.empty(result_shape, np.float32)
+
+    # Modify this number if using more than 1 time per file
+    times_per_file = 1
+
+    for timeidx in range(result_shape[0]):
+        # Compute the file index and the time index inside the file
+        fileidx = timeidx // times_per_file
+        file_timeidx = timeidx % times_per_file
+
+        f = Dataset(filename_list[fileidx])
+        z = getvar(f, "z", file_timeidx)
+
+        z_final[timeidx, :] = z[:]
+        f.close()
+
diff --git a/doc/source/index.rst b/doc/source/index.rst
index c0ae7c1..e0ca1e0 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -46,7 +46,7 @@ Documentation
    ./faq
    ./support
    ./license
-   ./workshop
+   ./tutorial_03_2018
 
 
 Indices and tables
diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst
new file mode 100644
index 0000000..29b68b5
--- /dev/null
+++ b/doc/source/tutorial.rst
@@ -0,0 +1,25 @@
+Tutorials
+=============
+
+NCAR occasionally provides tutorials for wrf-python at various times
+throughout the year.
+
+Below are the links to the upcoming and past tutorials.
+
+Upcoming Tutorials
+---------------------
+
+.. toctree::
+   :maxdepth: 1
+
+   tutorials/tutorial_03_2018.rst
+
+
+Past Tutorials
+------------------
+
+.. toctree::
+   :maxdepth: 1
+
+   tutorials/wrf_workshop_2017.rst
+
diff --git a/doc/source/tutorials/tutorial_03_2018.rst b/doc/source/tutorials/tutorial_03_2018.rst
new file mode 100644
index 0000000..3f5ec39
--- /dev/null
+++ b/doc/source/tutorials/tutorial_03_2018.rst
@@ -0,0 +1,76 @@
+WRF Tutorial 2018
+=====================
+
+NCAR will be providing a four-hour tutorial for wrf-python on Wednesday,
+March 7, 2018. The tutorial is free, but seating is limited to 16 students,
+so registration is required.
+
+The tutorial will take place at NCAR's corporate training center in Boulder,
+Colorado.
+
+`Corporate Technical Training Center `_
+3085 Center Green Drive, Building CG-2, Room #3024
+Boulder, Colorado
+
+Overview
+--------------
+
+This tutorial provides an introduction to wrf-python. The tutorial is
+suitable for new users of wrf-python, but it is not an introduction to the
+Python programming language (see Prerequisites below). Due to limited
+seating, if you do not have any previous experience with Python, please do
+not register for this tutorial.
+
+Students are encouraged to bring their own data sets, but data will be
+provided if this is not an option. Students will be given a Jupyter Notebook
+workbook that can be modified to accommodate their data.
+
+Topics include:
+
+- How to install wrf-python via conda
+- A brief introduction to Jupyter Notebook
+- Overview of WRF data files
+- WRF-Python basics
+- Plotting with cartopy
+- Overview of OpenMP features and other performance tips
+- Open lab for students
+
+
+Registration
+---------------
+
+The registration form is here:
+
+`Registration Form `_
+
+Registration consists of a short survey, which will give the instructors an
+overview of your background and help tailor the tutorial to your
+expectations.
+
+
+Prerequisites
+---------------
+
+This tutorial assumes that you have basic knowledge of how to type commands
+into a command terminal using your preferred operating system. You should
+know some basic directory commands like *cd*, *mkdir*, *cp*, and *mv*.
+
+This tutorial also assumes that you have prior experience programming in
+Python. Below is a list of some Python concepts that you will see in the
+examples, but don't worry if you aren't familiar with everything (a short,
+self-contained example follows the list).
+
+- Opening a Python interpreter and entering commands.
+- Importing packages via the import statement.
+- Familiarity with some of the basic Python types: str, list, tuple, dict,
+  bool, float, int, None.
+- Creating a list, tuple, or dict with "[ ]", "( )", "{ }" syntax
+  (e.g. my_list = [1,2,3,4,5]).
+- Accessing dict/list/tuple items with the "x[ ]" syntax
+  (e.g. my_list_item = my_list[0]).
+- Slicing str/list/tuple with the ":" syntax (e.g. my_slice = my_list[1:3]).
+- Using object methods and attributes with the "x.y" syntax
+  (e.g. my_list.append(6)).
+- Calling functions (e.g. result = some_function(x, y)).
+- Familiarity with numpy would be helpful, as only a very brief introduction
+  is provided.
+- Familiarity with matplotlib would be helpful, as only a very brief
+  introduction is provided.
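+
+As a quick self-check, the short snippet below (purely illustrative, with
+made-up values) uses several of the concepts from this list. If it reads
+naturally to you, you should be comfortable with the Python used in the
+tutorial.
+
+.. code-block:: python
+
+    # Purely illustrative values
+    temps = [21.5, 23.0, 24.5, 26.0]            # a list
+    metadata = {"units": "degC", "name": "t2"}  # a dict
+
+    first_temp = temps[0]        # indexing
+    middle_temps = temps[1:3]    # slicing
+    temps.append(27.5)           # calling an object method
+
+    def to_fahrenheit(value):
+        return value * 9.0 / 5.0 + 32.0
+
+    print(to_fahrenheit(first_temp), metadata["units"])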
+
+
diff --git a/doc/source/workshop.rst b/doc/source/tutorials/wrf_workshop_2017.rst
similarity index 100%
rename from doc/source/workshop.rst
rename to doc/source/tutorials/wrf_workshop_2017.rst
diff --git a/doc/source/user_api/index.rst b/doc/source/user_api/index.rst
index 17b72e0..587d429 100644
--- a/doc/source/user_api/index.rst
+++ b/doc/source/user_api/index.rst
@@ -74,7 +74,6 @@ the array object to a compiled extension.
    :toctree: ./generated/
 
    wrf.to_np
-
 
 Variable Extraction Routines
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -110,7 +109,7 @@ The routines below are used to assist with plotting.
    wrf.get_pyngl
    wrf.cartopy_xlim
    wrf.cartopy_ylim
-
+
 
 Raw Diagnostic Routines
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -154,6 +153,82 @@ sure they are removed before calling these routines.
    wrf.omega
    wrf.pw
 
+OpenMP Runtime Library Routines
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The routines below are the OpenMP runtime library routines that have been
+wrapped for wrf-python. The entire library (OpenMP 3.x) has been wrapped, but
+many of the routines are only useful inside of an OpenMP thread, so they
+aren't useful when called from the Python interpreter. Also, the Fortran code
+in wrf-python is fairly simple in terms of threading, so features like nested
+threads aren't used. The documentation below is split into the useful OpenMP
+functions and the less useful functions.
+
+The documentation for each routine was taken directly from the
+`OpenMP Specification `_.
+Read the specification for more details about these routines.
+
+Useful OpenMP Routines
+*****************************
+
+The routines below are useful when called from within a Python program. These
+routines handle setting the number of threads, setting up the scheduler,
+and timing.
+
+It is also important to note that the OpenMP directives within the Fortran
+code all specify a runtime scheduler. This means that the user can control
+the type of scheduling to use from within their Python application by using
+the routines below.
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ./generated/
+
+   wrf.omp_enabled
+   wrf.omp_set_num_threads
+   wrf.omp_get_max_threads
+   wrf.omp_get_num_procs
+   wrf.omp_set_dynamic
+   wrf.omp_get_dynamic
+   wrf.omp_set_schedule
+   wrf.omp_get_schedule
+   wrf.omp_get_thread_limit
+   wrf.omp_get_wtime
+   wrf.omp_get_wtick
+
+Less Useful OpenMP Routines
+*******************************
+
+The routines below are less useful because wrf-python does not use nested
+parallelism and some of the routines are only applicable when called from
+within an OpenMP thread.
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ./generated/
+
+   wrf.omp_get_num_threads
+   wrf.omp_get_thread_num
+   wrf.omp_in_parallel
+   wrf.omp_set_nested
+   wrf.omp_get_nested
+   wrf.omp_set_max_active_levels
+   wrf.omp_get_max_active_levels
+   wrf.omp_get_level
+   wrf.omp_get_ancestor_thread_num
+   wrf.omp_get_team_size
+   wrf.omp_get_active_level
+   wrf.omp_in_final
+   wrf.omp_init_lock
+   wrf.omp_init_nest_lock
+   wrf.omp_destroy_lock
+   wrf.omp_destroy_nest_lock
+   wrf.omp_set_lock
+   wrf.omp_set_nest_lock
+   wrf.omp_unset_lock
+   wrf.omp_unset_nest_lock
+   wrf.omp_test_lock
+   wrf.omp_test_nest_lock
 
 Configuration Routines
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/src/wrf/api.py b/src/wrf/api.py
index c774505..660c9d6 100644
--- a/src/wrf/api.py
+++ b/src/wrf/api.py
@@ -4,7 +4,9 @@ from .config import (xarray_enabled, disable_xarray, enable_xarray,
                      pyngl_enabled, enable_pyngl, disable_pyngl,
                      set_cache_size, get_cache_size, omp_enabled)
 from .constants import (ALL_TIMES, Constants, ConversionFactors,
-                        ProjectionTypes, default_fill)
+                        ProjectionTypes, default_fill,
+                        OMP_SCHED_STATIC, OMP_SCHED_DYNAMIC,
+                        OMP_SCHED_GUIDED, OMP_SCHED_AUTO)
 from .destag import destagger
 from .routines import getvar
 from .computation import (xy, interp1d, interp2dxy, interpz3d, slp, tk, td, rh,
@@ -60,7 +62,8 @@ __all__ += ["xarray_enabled", "disable_xarray", "enable_xarray",
             "pyngl_enabled", "enable_pyngl", "disable_pyngl",
             "set_cache_size", "get_cache_size", "omp_enabled"]
 __all__ += ["ALL_TIMES", "Constants", "ConversionFactors", "ProjectionTypes",
-            "default_fill"]
+            "default_fill", "OMP_SCHED_STATIC", "OMP_SCHED_DYNAMIC",
+            "OMP_SCHED_GUIDED", "OMP_SCHED_AUTO"]
 __all__ += ["destagger"]
 __all__ += ["getvar"]
 __all__ += ["xy", "interp1d", "interp2dxy", "interpz3d", "slp", "tk", "td",
diff --git a/src/wrf/config.py b/src/wrf/config.py
index adf9705..d22d5de 100644
--- a/src/wrf/config.py
+++ b/src/wrf/config.py
@@ -4,7 +4,9 @@ from __future__ import (absolute_import, division, print_function,
 from threading import local
 import wrapt
 
-from ._wrffortran import fomp_enabled
+from ._wrffortran import (fomp_enabled, fomp_set_num_threads,
+                          fomp_set_schedule, fomp_set_dynamic,
+                          omp_constants)
 
 _local_config = local()
 
@@ -214,3 +216,10 @@ def omp_enabled():
 
     return True if fomp_enabled() else False
 
+
+# Set OpenMP to use 1 thread, static scheduler, and no dynamic
+# Note: Using the raw extension functions here to prevent possible
+# circular import problems in the future.
+fomp_set_num_threads(1)
+fomp_set_schedule(omp_constants.fomp_sched_static, 0)
+fomp_set_dynamic(False)
diff --git a/src/wrf/constants.py b/src/wrf/constants.py
index ab6da60..6a6dc90 100755
--- a/src/wrf/constants.py
+++ b/src/wrf/constants.py
@@ -17,10 +17,10 @@ class Constants(object):
 for key,val in viewitems(wrf_constants.__dict__):
     setattr(Constants, key.upper(), np.asscalar(val))
 
-setattr(Constants, "OMP_SCHED_STATIC", omp_constants.fomp_sched_static)
-setattr(Constants, "OMP_SCHED_DYNAMIC", omp_constants.fomp_sched_dynamic)
-setattr(Constants, "OMP_SCHED_GUIDED", omp_constants.fomp_sched_guided)
-setattr(Constants, "OMP_SCHED_AUTO", omp_constants.fomp_sched_auto)
+OMP_SCHED_STATIC = omp_constants.fomp_sched_static
+OMP_SCHED_DYNAMIC = omp_constants.fomp_sched_dynamic
+OMP_SCHED_GUIDED = omp_constants.fomp_sched_guided
+OMP_SCHED_AUTO = omp_constants.fomp_sched_auto
 
 
 class ConversionFactors(object):
diff --git a/src/wrf/extension.py b/src/wrf/extension.py
index 669d4f2..5747f64 100755
--- a/src/wrf/extension.py
+++ b/src/wrf/extension.py
@@ -1159,7 +1159,7 @@ def omp_get_nested():
     return fomp_get_nested()
 
 
-def omp_set_schedule(kind, modifier):
+def omp_set_schedule(kind, modifier=0):
     """Set the schedule that is applied when *runtime* is used as
     schedule kind, by setting the value of the run-sched-var ICV.
 
@@ -1182,7 +1182,8 @@ def omp_set_schedule(kind, modifier):
 
         modifier(:obj:`int`): An implementation specific value, depending
             on the choice for *kind*.  This parameter is alternatively named
-            chunk_size in some OpenMP documentation.
+            chunk_size in some OpenMP documentation. Default is 0, which
+            means the OpenMP implementation will use its default value.
 
     Returns:
 
diff --git a/src/wrf/util.py b/src/wrf/util.py
index e245456..480f4f8 100644
--- a/src/wrf/util.py
+++ b/src/wrf/util.py
@@ -1249,13 +1249,13 @@ def _build_data_array(wrfnc, varname, timeidx, is_moving_domain, is_multifile,
         data = data[np.newaxis]
 
     attrs = OrderedDict()
-    for key, val in viewitems(var.__dict__):
+    for dkey, val in viewitems(var.__dict__):
         # scipy.io adds these but don't want them
-        if key in ("data", "_shape", "_size", "_typecode", "_attributes",
+        if dkey in ("data", "_shape", "_size", "_typecode", "_attributes",
                    "maskandscale", "dimensions"):
             continue
 
-        _key = key if isinstance(key, str) else key.decode()
+        _dkey = dkey if isinstance(dkey, str) else dkey.decode()
         if isstr(val):
             _val = val
         else:
@@ -1264,7 +1264,7 @@ def _build_data_array(wrfnc, varname, timeidx, is_moving_domain, is_multifile,
         else:
             _val = val
 
-        attrs[_key] = _val
+        attrs[_dkey] = _val
 
     dimnames = var.dimensions[-data.ndim:]