beam-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From al...@apache.org
Subject [beam] branch master updated: Restrict pyvcf dependency to Python 2 and link Jira issues tracking Py3 limitations. (#7773)
Date Fri, 08 Feb 2019 22:08:43 GMT
This is an automated email from the ASF dual-hosted git repository.

altay pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git


The following commit(s) were added to refs/heads/master by this push:
     new a84c5b0  Restrict pyvcf dependency to Python 2 and link Jira issues tracking Py3 limitations. (#7773)
a84c5b0 is described below

commit a84c5b08179117005c11ed0559057a6ccff00f8e
Author: tvalentyn <tvalentyn@users.noreply.github.com>
AuthorDate: Fri Feb 8 14:08:29 2019 -0800

    Restrict pyvcf dependency to Python 2 and link Jira issues tracking Py3 limitations. (#7773)
    
    * Restrict pyvcf dependency to Python 2 and link Jira issues tracking Py3 limitations.
    * Skip all VCF IO tests so that we don't need to install pyvcf on Python 3.
    * Don't import VCF on Python 3.
---
 sdks/python/apache_beam/io/vcfio.py               | 10 ++-
 sdks/python/apache_beam/io/vcfio_test.py          | 91 +----------------------
 sdks/python/container/base_image_requirements.txt |  4 +-
 sdks/python/setup.py                              |  6 +-
 4 files changed, 17 insertions(+), 94 deletions(-)

diff --git a/sdks/python/apache_beam/io/vcfio.py b/sdks/python/apache_beam/io/vcfio.py
index 59e470f..0ce76bd 100644
--- a/sdks/python/apache_beam/io/vcfio.py
+++ b/sdks/python/apache_beam/io/vcfio.py
@@ -23,7 +23,9 @@ The 4.2 spec is available at https://samtools.github.io/hts-specs/VCFv4.2.pdf.
 from __future__ import absolute_import
 
 import logging
+import sys
 import traceback
+import warnings
 from builtins import next
 from builtins import object
 from collections import namedtuple
@@ -32,8 +34,6 @@ from future.utils import iteritems
 from past.builtins import long
 from past.builtins import unicode
 
-import vcf
-
 from apache_beam.coders import coders
 from apache_beam.io import filebasedsource
 from apache_beam.io.filesystem import CompressionTypes
@@ -41,6 +41,12 @@ from apache_beam.io.iobase import Read
 from apache_beam.io.textio import _TextSource as TextSource
 from apache_beam.transforms import PTransform
 
+if sys.version_info[0] < 3:
+  import vcf
+else:
+  warnings.warn("VCF IO will support Python 3 after migration to Nucleus, "
+                "see: BEAM-5628.")
+
 
 __all__ = ['ReadFromVcf', 'Variant', 'VariantCall', 'VariantInfo',
            'MalformedVcfRecord']
diff --git a/sdks/python/apache_beam/io/vcfio_test.py b/sdks/python/apache_beam/io/vcfio_test.py
index b3f912b..9a4b793 100644
--- a/sdks/python/apache_beam/io/vcfio_test.py
+++ b/sdks/python/apache_beam/io/vcfio_test.py
@@ -93,6 +93,9 @@ def _count_equals_to(expected_count):
   return _count_equal
 
 
+@unittest.skipIf(sys.version_info[0] == 3,
+                 'VCF io will be ported to Python 3 after switch to Nucleus. '
+                 'See BEAM-5628')
 class VcfSourceTest(unittest.TestCase):
 
   # Distribution should skip tests that need VCF files due to large size
@@ -229,10 +232,6 @@ class VcfSourceTest(unittest.TestCase):
 
     return (malformed_vcf_records, malformed_header_lines)
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_sort_variants(self):
     sorted_variants = [
         Variant(reference_name='a', start=20, end=22),
@@ -244,10 +243,6 @@ class VcfSourceTest(unittest.TestCase):
     for permutation in permutations(sorted_variants):
       self.assertEqual(sorted(permutation), sorted_variants)
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_variant_equality(self):
     base_variant = Variant(reference_name='a', start=20, end=22,
                            reference_bases='a', alternate_bases=['g', 't'],
@@ -274,10 +269,6 @@ class VcfSourceTest(unittest.TestCase):
     self.assertNotEqual(base_variant, missing_field)
 
   @unittest.skipIf(VCF_FILE_DIR_MISSING, 'VCF test file directory is missing')
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_read_single_file_large(self):
     test_data_conifgs = [
         {'file': 'valid-4.0.vcf', 'num_records': 5},
@@ -292,10 +283,6 @@ class VcfSourceTest(unittest.TestCase):
       self.assertEqual(config['num_records'], len(read_data))
 
   @unittest.skipIf(VCF_FILE_DIR_MISSING, 'VCF test file directory is missing')
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_read_file_pattern_large(self):
     read_data = self._read_records(
         os.path.join(get_full_dir(), 'valid-*.vcf'))
@@ -304,10 +291,6 @@ class VcfSourceTest(unittest.TestCase):
         os.path.join(get_full_dir(), 'valid-*.vcf.gz'))
     self.assertEqual(9900, len(read_data_gz))
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_single_file_no_records(self):
     self.assertEqual(
         [], self._create_temp_file_and_read_records(['']))
@@ -316,10 +299,6 @@ class VcfSourceTest(unittest.TestCase):
     self.assertEqual(
         [], self._create_temp_file_and_read_records(_SAMPLE_HEADER_LINES))
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_single_file_verify_details(self):
     variant_1, vcf_line_1 = self._get_sample_variant_1()
     read_data = self._create_temp_file_and_read_records(
@@ -333,10 +312,6 @@ class VcfSourceTest(unittest.TestCase):
     self.assertEqual(3, len(read_data))
     self._assert_variants_equal([variant_1, variant_2, variant_3], read_data)
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_file_pattern_verify_details(self):
     variant_1, vcf_line_1 = self._get_sample_variant_1()
     variant_2, vcf_line_2 = self._get_sample_variant_2()
@@ -351,10 +326,6 @@ class VcfSourceTest(unittest.TestCase):
       self._assert_variants_equal([variant_1, variant_2, variant_3], read_data)
 
   @unittest.skipIf(VCF_FILE_DIR_MISSING, 'VCF test file directory is missing')
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_read_after_splitting(self):
     file_name = get_full_file_path('valid-4.1-large.vcf')
     source = VcfSource(file_name)
@@ -369,10 +340,6 @@ class VcfSourceTest(unittest.TestCase):
       split_records.extend(source_test_utils.read_from_source(*source_info))
     self.assertEqual(9882, len(split_records))
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_invalid_file(self):
     invalid_file_contents = self._get_invalid_file_contents()
     for content in chain(*invalid_file_contents):
@@ -384,10 +351,6 @@ class VcfSourceTest(unittest.TestCase):
         self._create_temp_vcf_file(content, tempdir)
       self._read_records(os.path.join(tempdir.get_path(), '*.vcf'))
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_allow_malformed_records(self):
     invalid_records, invalid_headers = self._get_invalid_file_contents()
 
@@ -406,10 +369,6 @@ class VcfSourceTest(unittest.TestCase):
         self._read_records(self._create_temp_vcf_file(content, tempdir),
                            allow_malformed_records=True)
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_no_samples(self):
     header_line = '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO\n'
     record_line = '19	123	.	G	A	.	PASS	AF=0.2'
@@ -422,10 +381,6 @@ class VcfSourceTest(unittest.TestCase):
     self.assertEqual(1, len(read_data))
     self.assertEqual(expected_variant, read_data[0])
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_no_info(self):
     record_line = 'chr19	123	.	.	.	.	.	.	GT	.	.'
     expected_variant = Variant(reference_name='chr19', start=122, end=123)
@@ -438,10 +393,6 @@ class VcfSourceTest(unittest.TestCase):
     self.assertEqual(1, len(read_data))
     self.assertEqual(expected_variant, read_data[0])
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_info_numbers_and_types(self):
     info_headers = [
         '##INFO=<ID=HA,Number=A,Type=String,Description="StringInfo_A">\n',
@@ -475,10 +426,6 @@ class VcfSourceTest(unittest.TestCase):
     self.assertEqual(2, len(read_data))
     self._assert_variants_equal([variant_1, variant_2], read_data)
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_end_info_key(self):
     phaseset_header_line = (
         '##INFO=<ID=END,Number=1,Type=Integer,Description="End of record.">\n')
@@ -497,10 +444,6 @@ class VcfSourceTest(unittest.TestCase):
     self.assertEqual(2, len(read_data))
     self._assert_variants_equal([variant_1, variant_2], read_data)
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_custom_phaseset(self):
     phaseset_header_line = (
         '##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phaseset">\n')
@@ -524,10 +467,6 @@ class VcfSourceTest(unittest.TestCase):
     self.assertEqual(2, len(read_data))
     self._assert_variants_equal([variant_1, variant_2], read_data)
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_format_numbers(self):
     format_headers = [
         '##FORMAT=<ID=FU,Number=.,Type=String,Description="Format_variable">\n',
@@ -551,10 +490,6 @@ class VcfSourceTest(unittest.TestCase):
     self.assertEqual(1, len(read_data))
     self.assertEqual(expected_variant, read_data[0])
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_pipeline_read_single_file(self):
     with TempDir() as tempdir:
       file_name = self._create_temp_vcf_file(_SAMPLE_HEADER_LINES +
@@ -565,10 +500,6 @@ class VcfSourceTest(unittest.TestCase):
       pipeline.run()
 
   @unittest.skipIf(VCF_FILE_DIR_MISSING, 'VCF test file directory is missing')
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_pipeline_read_single_file_large(self):
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromVcf(
@@ -577,10 +508,6 @@ class VcfSourceTest(unittest.TestCase):
     pipeline.run()
 
   @unittest.skipIf(VCF_FILE_DIR_MISSING, 'VCF test file directory is missing')
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_pipeline_read_file_pattern_large(self):
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromVcf(
@@ -588,10 +515,6 @@ class VcfSourceTest(unittest.TestCase):
     assert_that(pcoll, _count_equals_to(9900))
     pipeline.run()
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_read_reentrant_without_splitting(self):
     with TempDir() as tempdir:
       file_name = self._create_temp_vcf_file(_SAMPLE_HEADER_LINES +
@@ -599,10 +522,6 @@ class VcfSourceTest(unittest.TestCase):
       source = VcfSource(file_name)
       source_test_utils.assert_reentrant_reads_succeed((source, None, None))
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_read_reentrant_after_splitting(self):
     with TempDir() as tempdir:
       file_name = self._create_temp_vcf_file(_SAMPLE_HEADER_LINES +
@@ -613,10 +532,6 @@ class VcfSourceTest(unittest.TestCase):
       source_test_utils.assert_reentrant_reads_succeed(
           (splits[0].source, splits[0].start_position, splits[0].stop_position))
 
-  @unittest.skipIf(sys.version_info[0] == 3 and
-                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
-                   'VCF io will only be ported after switch to Nucleus '
-                   'See BEAM-5628')
   def test_dynamic_work_rebalancing(self):
     with TempDir() as tempdir:
       file_name = self._create_temp_vcf_file(_SAMPLE_HEADER_LINES +
diff --git a/sdks/python/container/base_image_requirements.txt b/sdks/python/container/base_image_requirements.txt
index 032a9ca..bf311d2 100644
--- a/sdks/python/container/base_image_requirements.txt
+++ b/sdks/python/container/base_image_requirements.txt
@@ -40,7 +40,7 @@ pyarrow==0.11.1
 pydot==1.2.4
 pyparsing==2.3.1
 pytz==2018.4
-pyvcf==0.6.8
+pyvcf==0.6.8;python_version<"3.0"
 pyyaml==3.12
 typing==3.6.1
 
@@ -49,7 +49,7 @@ nose==1.3.7
 
 # GCP extra features
 google-apitools==0.5.26
-googledatastore==7.0.1
+googledatastore==7.0.1;python_version<"3.0"
 google-cloud-pubsub==0.39.0
 google-cloud-bigquery==1.6.0
 proto-google-cloud-datastore-v1==0.90.4
diff --git a/sdks/python/setup.py b/sdks/python/setup.py
index c44c97e..8b23605 100644
--- a/sdks/python/setup.py
+++ b/sdks/python/setup.py
@@ -116,12 +116,13 @@ REQUIRED_PACKAGES = [
     'oauth2client>=2.0.1,<4',
     # grpcio 1.8.1 and above requires protobuf 3.5.0.post1.
     'protobuf>=3.5.0.post1,<4',
-    # pyarrow is not supported on Windows for Python 2 [BEAM-6287]
+    # [BEAM-6287] pyarrow is not supported on Windows for Python 2
     ('pyarrow>=0.11.1,<0.12.0; python_version >= "3.0" or '
      'platform_system != "Windows"'),
     'pydot>=1.2.0,<1.3',
     'pytz>=2018.3',
-    'pyvcf>=0.6.8,<0.7.0',
+    # [BEAM-5628] Beam VCF IO is not supported in Python 3.
+    'pyvcf>=0.6.8,<0.7.0; python_version < "3.0"',
     'pyyaml>=3.12,<4.0.0',
     'typing>=3.6.0,<3.7.0; python_version < "3.5.0"',
     ]
@@ -139,6 +140,7 @@ GCP_REQUIREMENTS = [
     # google-apitools 0.5.23 and above has important Python 3 supports.
     'google-apitools>=0.5.26,<0.5.27',
     'proto-google-cloud-datastore-v1>=0.90.0,<=0.90.4',
+    # [BEAM-4543] Datastore IO is not supported in Python 3.
     'googledatastore>=7.0.1,<7.1; python_version < "3.0"',
     'google-cloud-pubsub==0.39.0',
     # GCP packages required by tests


Mime
View raw message