From 48f406c2638c22267c9c3c3b7e16d8568a4652cb Mon Sep 17 00:00:00 2001 From: Bill Ladwig Date: Tue, 9 Jan 2018 11:45:45 -0700 Subject: [PATCH] Fixed issue with coordinate cache key name conflict. Also added documentation for OpenMP and performance tips. The tutorial section has been moved to better support announcements in the future. Renamed the OpenMP scheduler constants so that they are placed in to the wrf namespace rather than wrf.Constants. --- doc/source/basic_usage.rst | 314 ++++++++++++++++++ doc/source/index.rst | 2 +- doc/source/tutorial.rst | 25 ++ doc/source/tutorials/tutorial_03_2018.rst | 76 +++++ .../wrf_workshop_2017.rst} | 0 doc/source/user_api/index.rst | 79 ++++- src/wrf/api.py | 7 +- src/wrf/config.py | 11 +- src/wrf/constants.py | 8 +- src/wrf/extension.py | 5 +- src/wrf/util.py | 8 +- 11 files changed, 519 insertions(+), 16 deletions(-) create mode 100644 doc/source/tutorial.rst create mode 100644 doc/source/tutorials/tutorial_03_2018.rst rename doc/source/{workshop.rst => tutorials/wrf_workshop_2017.rst} (100%) diff --git a/doc/source/basic_usage.rst b/doc/source/basic_usage.rst index 3bd5eba..9a201c7 100644 --- a/doc/source/basic_usage.rst +++ b/doc/source/basic_usage.rst @@ -1,6 +1,34 @@ How To Use ============ +Introduction +--------------- + +The API for wrf-python can be summarized as a variable extraction/computation +routine, several interpolation routines, and some plotting utilities. +The API is kept as simple as possible to help minimize the +programming burden on new users, students, and scientists. In the future, we +plan to extend xarray for programmers desiring a more object oriented API, +but this remains a work in progress. + +The five most commonly used routines can be summarized as: + +- **wrf.getvar**: The routine that extracts WRF NetCDF variables or + computes diagnostic variables. This is the routine you will use most often. + +- **wrf.interplevel**: Interpolates a three-dimensional field to a horizontal + plane at a specified level using simple (fast) linear interpolation. + +- **wrf.vertcross**: Interpolates a three-dimensional field to a vertical plane + through a user-specified horizontal line (i.e. a cross section). + +- **wrf.interpline**: Interpolates a two-dimensional field to a user-specified + line. + +- **wrf.vinterp**: Interpolates a three-dimensional field to user-specified + 'surface' levels (e.g. theta-e levels). This is a smarter, but slower, + version of wrf.interplevel. + Basic Usage ---------------- @@ -1614,3 +1642,289 @@ Result: ] + +Using OpenMP +------------------------- + +Beginning in version 1.1, the Fortran computational routines in wrf-python make +use of OpenMP directives. OpenMP enables the calculations to use multiple CPU +cores, which can improve performance. In order to use OpenMP features, +wrf-python has to be compiled with OpenMP enabled (most pre-built binary +installations will have this enabled). + +The Fortran computational routines have all been built using runtime +scheduling, instead of compile time scheduling, so that the user can choose the +scheduler type within their Python application. By default, the scheduling +type is set to :data:`wrf.OMP_SCHED_STATIC` using only 1 CPU core, so +wrf-python will behave similarly to the non-OpenMP built versions. For the most +part, the difference between the scheduling types is minimal, with the exception +being the :data:`wrf.OMP_SCHED_DYNAMIC` scheduler that is much slower due to +the additional overhead associated with it. For new users, using the default +scheduler should be sufficient. + + +Verifying that OpenMP is Enabled +************************************* + +To take advantage of the performance improvements offered by OpenMP, wrf-python +needs to have been compiled with OpenMP features enabled. The example below +shows how you can determine if OpenMP is enabled in your build of wrf-python. + +.. code-block:: python + + from __future__ import print_function + + from wrf import omp_enabled + + print(omp_enabled()) + + +Result: + +.. code-block:: none + + True + + +Determining the Number of Available Processors +*************************************************** + +The example below shows how you can get the maximum number of processors +that are available on your system. + +.. code-block:: python + + from __future__ import print_function + + from wrf import omp_get_num_procs + + print(omp_get_num_procs()) + + +Result: + +.. code-block:: none + + 8 + + +Specifying the Number of Threads +************************************* + +To enable multicore support via OpenMP, specifying the maximum number +of OpenMP threads (i.e. CPU cores) is the only step that you need to take. + +In the example below, :meth:`wrf.omp_set_num_threads` is used to set the +maximum number of threads to use, and :meth:`wrf.omp_get_max_threads` is +get to get (and print) the maximum number of threads used. + +.. note:: + + Although there is an OpenMP library named :meth:`wrf.omp_get_num_threads`, + this method will always returns 1 when called from the sequential part of + the program. Use :meth:`wrf.omp_get_max_threads` to return the value set by + :meth:`wrf.omp_set_num_threads`. + +.. code-block:: python + + from __future__ import print_function + + from wrf import omp_set_num_threads, omp_get_max_threads + + omp_set_num_threads(4) + + print (omp_get_max_threads()) + + +Result: + +.. code-block:: none + + 4 + +Setting a Different Scheduler Type +************************************** + +When an OpenMP directive is encountered in the Fortran code, a scheduler is +used to determine how the work is divided among the threads. All of the +Fortran routines are compiled to use a 'runtime' scheduler, which indicates +that the scheduler type (from the four listed below) is to be chosen at +runtime (i.e. inside a Python script) + +By default, the scheduler chosen is the :data:`wrf.OMP_SCHED_STATIC` scheduler, +which should be sufficient for most users. However, OpenMP and wrf-python +include the following options for the scheduler type: + +- :data:`wrf.OMP_SCHED_STATIC` +- :data:`wrf.OMP_SCHED_DYNAMIC` +- :data:`wrf.OMP_SCHED_GUIDED` +- :data:`wrf.OMP_SCHED_AUTO` + +Refer to the +`OpenMP Specification `_. +for more information about these scheduler types. In local testing, +:data:`wrf.OMP_SCHED_GUIDED` produced the best results, but +differences between :data:`wrf.OMP_SCHED_STATIC`, +:data:`wrf.OMP_SCHED_GUIDED`, and +:data:`wrf.OMP_SCHED_AUTO` were minor. However, +:data:`wrf.OMP_SCHED_DYNAMIC` produced noticeably slower results +due to the overhead of using a dynamic scheduler. + +When setting a scheduler type, the :meth:`wrf.omp_set_schedule` takes two +arguments. The first is the scheduler type (one from the list above), and the +second optional argument is a modifier, which is usually referred as the chunk +size. If the modifier/chunk_size is set to 0, then the OpenMP default +implementation is used. For :data:`wrf.OMP_SCHED_AUTO`, the +modifier is ignored. + +If you are new to OpenMP and all this sounds confusing, don't worry about +setting a scheduler type. The default static scheduler will be good enough. + +In the example below, the scheduler type is set to +:data:`wrf.OMP_SCHED_GUIDED` and uses the default chunk size of 0. The +scheduler type is then read back using :meth:`wrf.omp_get_schedule` +and printed. + +.. code-block:: python + + from __future__ import print_function + + from wrf import omp_set_schedule, omp_get_schedule, OMP_SCHED_GUIDED + + omp_set_schedule(OMP_SCHED_GUIDED, 0) + + sched, modifier = omp_get_schedule() + + print(sched, modifier) + + +Result: + +.. code-block:: none + + 3 1 + +Notice that the printed scheduler type (*sched* variable) is set to a +value of 3, which is the actual integer constant value for the +:data:`wrf.OMP_SCHED_GUIDED` scheduler type. The *modifier* is returned as a +value of 1, which is different than the 0 that was supplied to the +:meth:`wrf.omp_set_schedule` routine. This is because the 0 tells OpenMP to use +its own default value for the scheduler, which is 1 for this type of scheduler. + + +Performance Note +****************** + +If you have enabled multicore support with OpenMP, you may have noticed that +the routines do not scale linearly with the number of CPU cores added. One main +reason is that the computational routines are already fairly efficient and +vectorize well, so for many grid sizes, the time it takes to extract the +variables is on par with the time required to compute the diagnostic with a +single CPU core. Adding more CPU cores will decrease the time needed to do the +computation, but total performance will still be limited by the time it takes +to extract the variables from the NetCDF file. For local testing, diminishing +returns were seen after 4 CPU cores, but this will largely depend on the +hardware used and grid size for your WRF run. + + +Performance Tips +-------------------- + +Memory Issues and :data:`wrf.ALL_TIMES` +****************************************** + +The use of :data:`wrf.ALL_TIMES` for the timeidx to :meth:`wrf.getvar` is +convenient for computing diagnostic variables across multiple files/times, but +there is something that users should be aware of. When :data:`wrf.ALL_TIMES` is +set as the *timeidx* argument, all arrays used in the computation are extracted +for all times before the computation is started. This can cause serious memory +issues on smaller hardware systems like laptops. + +In this example, the user wants to use a data set that is 289 x 39 x 300 x 300 +and compute z for the entire data set. The user is using a laptop with +16 GB of memory. + +.. code:: python + + from netCDF4 import Dataset + from wrf import getvar, ALL_TIMES + + file_list = [Dataset("/path/to/file1"), Dataset("/path/to/file2"),...] + z = getvar(file_list, "z", ALL_TIMES) + +Five hours later, the computation finished. What happened? + +In wrf-python, all of the computational routines use 8-byte REAL variables so +that both the 4-byte and 8-byte version of WRF output can be used. The +calculation for z extracts three variables (P, PHB, and HGT) and returns a +fourth array (RESULT). The RESULT will get cut in half to 4-byte REALs +after the computation, but need 8-byte REAL when the result is computed. + +Let's look at the approximate amount memory needed: + +P: 289 x 39 x 300 x 300 x 8 = 8,115,120,000 bytes (~8 GB!) +PHB: 289 x 39 x 300 x 300 x 8 = 8,115,120,000 bytes (~8 GB!) +HGT: 289 x 300 x 300 x 8 = 208,080,000 (~208 MB) +RESULT: 289 x 39 x 300 x 300 x 8 = 8,115,120,000 bytes (~8 GB!) + +Yikes! So, in order to do this calculation using :data:`wrf.ALL_TIMES` as +the timeidx, over 24.2 GB are needed for this one calculation. When the laptop +runs out of memory, it begins using the hard drive for swap memory, which runs +hundreds of times slower than real memory. + +To fix this situation, it is better to allocate the output array yourself and +run the calculation for each time step in a loop. The required memory +requirements change to: + +(Only need to store the result in a 4-byte REAL) +FINAL_RESULT: 289 x 39 x 300 x 300 x 4 = 4,057560,000 bytes (~4 GB) + +(The numbers below are for each loop iteration) +P: 39 x 300 x 300 x 8 = 28,080,000 bytes (~28 MB) +PHB: 39 x 300 x 300 x 8 = 28,080,000 bytes (~28 MB) +HGT: 300 x 300 x 8 = 720,000 bytes (720 KB) +RESULT: 39 x 300 x 300 x 8 = 28,080,000 bytes (~28 MB) + +Since the memory for the computation is deleted after each +loop iteration, the total memory usage drops to approximately 4.1 GB. + +The moral of the story is that you need to make sure that your system has +enough memory to extract everything it needs up front if you want to use +:data:`wrf.ALL_TIMES`, otherwise it is better to "loop-and-fill" yourself. + +Here is an example of the loop-and-fill technique: + +.. code:: python + + from __future__ import print_function, division + + import numpy as np + from netCDF4 import Dataset + from wrf import getvar, ALL_TIMES + + filename_list = ["/path/to/file1", "/path/to/file2",...] + + # Result shape (hardcoded for this example) + result_shape = (289, 39, 300, 300) + + # Only need 4-byte floats + z_final = np.empty(result_shape, np.float32) + + # Modify this number if using more than 1 time per file + times_per_file = 1 + + for timeidx in xrange(result_shape[0]): + # Compute the file index and the time index inside the file + fileidx = timeidx // times_per_file + file_timeidx = timeidx % times_per_file + + f = Dataset(filename_list[fileidx]) + z = getvar(f, "z", file_timeidx) + + z_final[timeidx,:] = z[:] + f.close() + + + + + diff --git a/doc/source/index.rst b/doc/source/index.rst index c0ae7c1..e0ca1e0 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -46,7 +46,7 @@ Documentation ./faq ./support ./license - ./workshop + ./tutorial_03_2018 Indices and tables diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst new file mode 100644 index 0000000..29b68b5 --- /dev/null +++ b/doc/source/tutorial.rst @@ -0,0 +1,25 @@ +Tutorials +============= + +NCAR occasionally provides tutorials for wrf-python at various times +throughout the year. + +Below are the links to the upcoming and past tutorials. + +Upcoming Tutorials +--------------------- + +.. toctree:: + :maxdepth: 1 + + tutorials/tutorial_03_2018.rst + + +Past Tutorials +------------------ + +.. toctree:: + :maxdepth: 1 + + tutorials/wrf_workshop_2017.rst + diff --git a/doc/source/tutorials/tutorial_03_2018.rst b/doc/source/tutorials/tutorial_03_2018.rst new file mode 100644 index 0000000..3f5ec39 --- /dev/null +++ b/doc/source/tutorials/tutorial_03_2018.rst @@ -0,0 +1,76 @@ +WRF Tutorial 2018 +===================== + +NCAR will be providing a four hour tutorial for wrf-python on Wednesday, March +7, 2018. The tutorial is free, but seating is limited to only 16 students, so +registration is required. + +The tutorial will take place at NCAR's corporate training center in Boulder, +Colorado. + +`Corporate Technical Training Center `_ +3085 Center Green Drive, Building CG-2, Room #3024 +Boulder, Colorado + +Overview +-------------- + +This tutorial provides an introduction to wrf-python. The tutorial is friendly +for new users of wrf-python, but this is not an introduction to the Python +programming language (see Prerequisites below). Due to limited seating, if you +do not have any previous experience with Python, please do not register +for this tutorial. + +Students are encouraged to bring their own data sets, but data will be provided +if this is not an option. Students will be provided a jupyter notebook workbook +which can be modified to accommodate their data. + +Topics include: + +- How to install wrf-python via conda +- A brief introduction to jupyter notebook +- Overview of WRF data files +- WRF-Python basics +- Plotting with cartopy +- Overview of OpenMP features and other performance tips +- Open lab for students + + +Registration +--------------- + +The registration form is here: + +`Registration Form `_ + +Registration consists of a brief survey, which will help give the instructors +a brief overview of your background and will help tailor the tutorial to +your expectations. + + +Prerequisites +--------------- + +This tutorial assumes that you have basic knowledge of how to type commands +in to a command terminal using your preferred operating system. You +should know some basic directory commands like *cd*, *mkdir*, *cp*, *mv*. + +This tutorial assumes that you have prior experience programming in Python. +Below is a list of some Python concepts that you will see in the examples, +but don't worry if you aren't familiar with everything. + +- Opening a Python interpreter and entering commands. +- Importing packages via the import statement. +- Familiarity with some of the basic Python types: str, list, tuple, dict, bool, float, int, None. +- Creating a list, tuple, or dict with "[ ]", "( )", "{ }" syntax (e.g. my_list = [1,2,3,4,5]). +- Accessing dict/list/tuple items with the "x[ ]" syntax (e.g. my_list_item = my_list[0]). +- Slicing str/list/tuple with the ":" syntax (e.g. my_slice = my_list[1:3]). +- Using object methods and attributes with the "x.y" syntax (e.g. my_list.append(6)). +- Calling functions (e.g. result = some_function(x, y)) +- Familiarity with numpy would be helpful, as only a very brief introduction + is provided. +- Familiarity with matplotlib would be helpful, as only a very brief + introduction is provided. + + + diff --git a/doc/source/workshop.rst b/doc/source/tutorials/wrf_workshop_2017.rst similarity index 100% rename from doc/source/workshop.rst rename to doc/source/tutorials/wrf_workshop_2017.rst diff --git a/doc/source/user_api/index.rst b/doc/source/user_api/index.rst index 17b72e0..587d429 100644 --- a/doc/source/user_api/index.rst +++ b/doc/source/user_api/index.rst @@ -74,7 +74,6 @@ the array object to a compiled extension. :toctree: ./generated/ wrf.to_np - Variable Extraction Routines ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -110,7 +109,7 @@ The routines below are used to assist with plotting. wrf.get_pyngl wrf.cartopy_xlim wrf.cartopy_ylim - + Raw Diagnostic Routines ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -154,6 +153,82 @@ sure they are removed before calling these routines. wrf.omega wrf.pw +OpenMP Runtime Library Routines +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The routines below are the OpenMP runtime libraries that have been wrapped +for wrf-python. The entire library (OpenMP 3.x) has been wrapped, but many of +the routines are only useful inside of an OpenMP thread, so they aren't useful +from inside the Python interpreter. Also, the Fortran code in wrf-python is +fairly simple in terms of threading, so features like nested threads aren't +used. The documentation below is split in to the useful OpenMP functions and +the less useful functions. + +The documentation for each routine was taken directly from the +`OpenMP Specification `_. +Read the specification for more details about these routines. + +Useful OpenMP Routines +***************************** + +The routines below are useful when called from within a Python program. These +routines handle setting the number of threads, setting up the scheduler, +and timing. + +It is also important to note that the OpenMP directives within the Fortran +code all specify a runtime scheduler. This means that the user can control +the type of scheduling to use from within their Python application by using the +routines below. + +.. autosummary:: + :nosignatures: + :toctree: ./generated/ + + wrf.omp_enabled + wrf.omp_set_num_threads + wrf.omp_get_max_threads + wrf.omp_get_num_procs + wrf.omp_set_dynamic + wrf.omp_get_dynamic + wrf.omp_set_schedule + wrf.omp_get_schedule + wrf.omp_get_thread_limit + wrf.omp_get_wtime + wrf.omp_get_wtick + +Less Useful OpenMP Routines +******************************* + +The routines below are less useful because wrf-python does not use nested +parallelism and some of the routines are only applicable when called from +within an OpenMP thread. + +.. autosummary:: + :nosignatures: + :toctree: ./generated/ + + wrf.omp_get_num_threads + wrf.omp_get_thread_num + wrf.omp_in_parallel + wrf.omp_set_nested + wrf.omp_get_nested + wrf.omp_set_max_active_levels + wrf.omp_get_max_active_levels + wrf.omp_get_level + wrf.omp_get_ancestor_thread_num + wrf.omp_get_team_size + wrf.omp_get_active_level + wrf.omp_in_final + wrf.omp_init_lock + wrf.omp_init_nest_lock + wrf.omp_destroy_lock + wrf.omp_destroy_nest_lock + wrf.omp_set_lock + wrf.omp_set_nest_lock + wrf.omp_unset_lock + wrf.omp_unset_nest_lock + wrf.omp_test_lock + wrf.omp_test_nest_lock Configuration Routines ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/src/wrf/api.py b/src/wrf/api.py index c774505..660c9d6 100644 --- a/src/wrf/api.py +++ b/src/wrf/api.py @@ -4,7 +4,9 @@ from .config import (xarray_enabled, disable_xarray, enable_xarray, pyngl_enabled, enable_pyngl, disable_pyngl, set_cache_size, get_cache_size, omp_enabled) from .constants import (ALL_TIMES, Constants, ConversionFactors, - ProjectionTypes, default_fill) + ProjectionTypes, default_fill, + OMP_SCHED_STATIC, OMP_SCHED_DYNAMIC, + OMP_SCHED_GUIDED, OMP_SCHED_AUTO) from .destag import destagger from .routines import getvar from .computation import (xy, interp1d, interp2dxy, interpz3d, slp, tk, td, rh, @@ -60,7 +62,8 @@ __all__ += ["xarray_enabled", "disable_xarray", "enable_xarray", "pyngl_enabled", "enable_pyngl", "disable_pyngl", "set_cache_size", "get_cache_size", "omp_enabled"] __all__ += ["ALL_TIMES", "Constants", "ConversionFactors", "ProjectionTypes", - "default_fill"] + "default_fill", "OMP_SCHED_STATIC", "OMP_SCHED_DYNAMIC", + "OMP_SCHED_GUIDED", "OMP_SCHED_AUTO"] __all__ += ["destagger"] __all__ += ["getvar"] __all__ += ["xy", "interp1d", "interp2dxy", "interpz3d", "slp", "tk", "td", diff --git a/src/wrf/config.py b/src/wrf/config.py index adf9705..d22d5de 100644 --- a/src/wrf/config.py +++ b/src/wrf/config.py @@ -4,7 +4,9 @@ from __future__ import (absolute_import, division, print_function, from threading import local import wrapt -from ._wrffortran import fomp_enabled +from ._wrffortran import (fomp_enabled, fomp_set_num_threads, + fomp_set_schedule, fomp_set_dynamic, + omp_constants) _local_config = local() @@ -214,3 +216,10 @@ def omp_enabled(): return True if fomp_enabled() else False + +# Set OpenMP to use 1 thread, static scheduler, and no dynamic +# Note: Using the raw extension functions here to prevent possible +# circular import problems in the future. +fomp_set_num_threads(1) +fomp_set_schedule(omp_constants.fomp_sched_static, 0) +fomp_set_dynamic(False) diff --git a/src/wrf/constants.py b/src/wrf/constants.py index ab6da60..6a6dc90 100755 --- a/src/wrf/constants.py +++ b/src/wrf/constants.py @@ -17,10 +17,10 @@ class Constants(object): for key,val in viewitems(wrf_constants.__dict__): setattr(Constants, key.upper(), np.asscalar(val)) -setattr(Constants, "OMP_SCHED_STATIC", omp_constants.fomp_sched_static) -setattr(Constants, "OMP_SCHED_DYNAMIC", omp_constants.fomp_sched_dynamic) -setattr(Constants, "OMP_SCHED_GUIDED", omp_constants.fomp_sched_guided) -setattr(Constants, "OMP_SCHED_AUTO", omp_constants.fomp_sched_auto) +OMP_SCHED_STATIC = omp_constants.fomp_sched_static +OMP_SCHED_DYNAMIC = omp_constants.fomp_sched_dynamic +OMP_SCHED_GUIDED = omp_constants.fomp_sched_guided +OMP_SCHED_AUTO = omp_constants.fomp_sched_auto class ConversionFactors(object): diff --git a/src/wrf/extension.py b/src/wrf/extension.py index 669d4f2..5747f64 100755 --- a/src/wrf/extension.py +++ b/src/wrf/extension.py @@ -1159,7 +1159,7 @@ def omp_get_nested(): return fomp_get_nested() -def omp_set_schedule(kind, modifier): +def omp_set_schedule(kind, modifier=0): """Set the schedule that is applied when *runtime* is used as schedule kind, by setting the value of the run-sched-var ICV. @@ -1182,7 +1182,8 @@ def omp_set_schedule(kind, modifier): modifier(:obj:`int`): An implementation specific value, depending on the choice for *kind*. This parameter is alternatively named - chunk_size in some OpenMP documentation. + chunk_size in some OpenMP documentation. Default is 0, which + means the OpenMP implementation will use its default value. Returns: diff --git a/src/wrf/util.py b/src/wrf/util.py index e245456..480f4f8 100644 --- a/src/wrf/util.py +++ b/src/wrf/util.py @@ -1249,13 +1249,13 @@ def _build_data_array(wrfnc, varname, timeidx, is_moving_domain, is_multifile, data = data[np.newaxis] attrs = OrderedDict() - for key, val in viewitems(var.__dict__): + for dkey, val in viewitems(var.__dict__): # scipy.io adds these but don't want them - if key in ("data", "_shape", "_size", "_typecode", "_attributes", + if dkey in ("data", "_shape", "_size", "_typecode", "_attributes", "maskandscale", "dimensions"): continue - _key = key if isinstance(key, str) else key.decode() + _dkey = dkey if isinstance(dkey, str) else dkey.decode() if isstr(val): _val = val else: @@ -1264,7 +1264,7 @@ def _build_data_array(wrfnc, varname, timeidx, is_moving_domain, is_multifile, else: _val = val - attrs[_key] = _val + attrs[_dkey] = _val dimnames = var.dimensions[-data.ndim:]